# for emacs: -*- mode: sh; -*-

#	Caenorhabditis remanei
#	From Wash U GSC
#	http://genome.wustl.edu/genome.cgi?GENOME=Caenorhabditis+remanei

#  $Id: caeRem2.txt,v 1.15 2008/05/23 23:07:19 hiram Exp $

#########################################################################
# DOWNLOAD SEQUENCE (DONE - 2007-03-16 - Hiram)
    ssh kkstore01
    mkdir /cluster/store10/caeRem2
    ln -s /cluster/store10/caeRem2 /cluster/data/caeRem2

    cd /cluster/data/caeRem2
    mkdir downloads
    cd downloads
    wget --timestamping \
ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_remanei/assembly/submitted/Caenorhabditis_remanei-1.0/ASSEMBLY

    wget --timestamping \
"ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_remanei/assembly/submitted/Caenorhabditis_remanei-1.0/README" \
	-O README.caeRem2

    wget --timestamping -m -np -nd \
"ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_remanei/assembly/submitted/Caenorhabditis_remanei-1.0/output/"

##########################################################################
## Create chrUn.agp file (DONE - 2007-03-16 - Hiram)
    ssh kkstore01
    cd /cluster/data/caeRem2/downloads
    cat << '_EOF_' > mkSuperLift.pl
#!/usr/bin/env perl

use strict;
use warnings;

my $start = 1;
my $end = 1;
my $itemCount = 1;
my $agpItemCount = 1;
my $curContig = "";
my $firstTime = 1;
my $scaffoldGapSize = 1000;
my $scafEnd = 1;
my $superStart = 0;
my $superEnd = 1;
my $superLength = 0;
my $chrUnSize = 161944676;
my $chrUnName = "chrUn";
my $ctgFragCount = 1;

open (CT,">caeRem2.ctgPos2.tab") or die "Can not write to caeRem2.ctgPos2.tab";

open (SC,">caeRem2.CtgScaf.agp") or die "Can not write to caeRem2.CtgScaf.agp";

open (AG,">caeRem2.chrUn.agp") or die "Can not write to caeRem2.chrUn.agp";

open (GL,">caeRem2.gold.tab") or die "Can not write to caeRem2.gold.tab";

open (FH,'zcat supercontigs.agp.gz|') or
	die "can not open zcat supercontigs.agp.gz";
while (my $line=<FH>) {
    chomp $line;
    if ($line =~ m/fragment/) {
	my ($name, $gStart, $gEnd, $gapCounter, $gapWN, $gapLen, $frag, $yesNo) =
	    split('\s+',$line);
	$end = $start + $gapLen - 1;
	$scafEnd += $gapLen;
	printf SC "chrUn\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n",
		    $start, $end, $agpItemCount, $gapWN, $gapLen, $frag, $yesNo;
	printf AG "chrUn\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n",
		    $start, $end, $agpItemCount++, $gapWN, $gapLen, $frag, $yesNo;
	$start = $end + 1;
    } else {
    my ($ctgName, $ctgStart, $ctgEnd, $ctgCounter, $ctgWN, $name, $cStart, $cEnd, $strand) =
	split('\s+',$line);
	my $ctgLen = $ctgEnd - $ctgStart + 1;
	my $cLen = $cEnd - $cStart + 1;
	die "not matching start, end:\n$line" if ($ctgLen != $cLen);
	if (!$firstTime) {
	    if ($ctgName ne $curContig) {
		$superLength = $superEnd - $superStart;
		$end = $start + $scaffoldGapSize - 1;
		printf SC "chrUn\t%d\t%d\t%d\tN\t%d\tscaffold\tno\n",
		    $start, $end, $agpItemCount, $scaffoldGapSize;
		printf AG "chrUn\t%d\t%d\t%d\tN\t%d\tscaffold\tno\n",
		    $start, $end, $agpItemCount++, $scaffoldGapSize;
		printf "%d\t%s\t%d\t%s\t%d\n",
			$superStart, $curContig, $superLength, $chrUnName,
				$chrUnSize;
		printf GL "%s\t%d\t%d\t%d\tW\t%s\t0\t%d\t+\n",
			$chrUnName, $superStart, $superEnd, $itemCount++,
			$curContig, $superLength;
		$start = $end + 1;
		$scafEnd = $cStart - 1;
		$superStart = $start - 1;
		$ctgFragCount = 1;
	    }
	} else {
	    $firstTime = 0;
	    $scafEnd = 0;
	}
	$scafEnd += $ctgLen;
	my $fragStart = $scafEnd - $ctgLen + 1;
	$end = $start + $ctgLen - 1;
	$superEnd = $end;
	my $ctgFragName = sprintf("%s.%d", $ctgName, $ctgFragCount++);
	my $ctgFragStart = 1;
	my $ctgFragEnd = $scafEnd - $fragStart + 1;
	printf SC "chrUn\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n",
		$start, $end, $agpItemCount, $ctgWN, $ctgFragName,
		$ctgFragStart, $ctgFragEnd, $strand;
	printf AG "chrUn\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n",
		$start, $end, $agpItemCount++, $ctgWN, $ctgName,
		$fragStart, $scafEnd, $strand;
	printf CT "%s\t%d\tchrUn\t%d\t%d\tW\n",
                $name, $ctgLen, $start-1, $end;

	$start = $end + 1;
	$curContig = $ctgName;
    }
}
close (FH);
close (AG);
close (SC);
$superLength = $superEnd - $superStart;
printf "%d\t%s\t%d\t%s\t%d\n",
    $superStart, $curContig, $superLength, $chrUnName, $chrUnSize;
printf GL "%s\t%d\t%d\t%d\tW\t%s\t0\t%d\t+\n",
	$chrUnName, $superStart, $superEnd, $itemCount++,
	$curContig, $superLength;
close (GL);
'_EOF_'
    # << happy emacs
    chmod +x mkSuperLift.pl
    ./mkSuperLift.pl > caeRem2.chrUn.lift

    qaToQac contigs.fa.qual.gz stdout \
        | qacAgpLift caeRem2.CtgScaf.agp stdin chrUn.qac

###########################################################################
## Initial genome build (DONE - 2007-03-29 - Hiram)
    ssh kkstore01
    cd /cluster/data/caeRem2
    cat << '_EOF_' > caeRem2.config.ra
# Config parameters for makeGenomeDb.pl:
db caeRem2
clade worm
genomeCladePriority 10
scientificName Caenorhabditis remanei
commonName C. remanei
assemblyDate Mar. 2006
assemblyLabel Washington University School of Medicine GSC C. remanei 1.0
orderKey 879
mitoAcc none
fastaFiles /cluster/data/caeRem2/downloads/supercontigs.fa.gz
agpFiles /cluster/data/caeRem2/downloads/caeRem2.chrUn.agp
# qualFiles /cluster/data/caeRem2/downloads/chrUn.qac
dbDbSpeciesDir worm
'_EOF_'
    # << happy emacs

    time nice -n +19 makeGenomeDb.pl caeRem2.config.ra \
	> makeGenomeDb.out 2>&1

    ssh hgwdev
    cd /cluster/data/caeRem2/downloads
    sed -e "s/agpFrag/chrUn_gold/" $HOME/kent/src/hg/lib/agpFrag.sql \
	> chrUn_gold.sql
    # edit that .sql file to add the bin column
    #	 bin smallint(6) NOT NULL default '0',

    hgLoadBed -sqlTable=chrUn_gold.sql caeRem2 chrUn_gold caeRem2.gold.tab
    hgLoadSqlTab caeRem2 ctgPos2 ~/kent/src/hg/lib/ctgPos2.sql \
	caeRem2.ctgPos2.tab

###########################################################################
## Repeat Masker (DONE - 2007-03-29 - Hiram)
    ssh kkstore01
    mkdir /cluster/data/caeRem2/bed/RepeatMasker
    cd /cluster/data/caeRem2/bed/RepeatMasker
    time nice -n +19 doRepeatMasker.pl caeRem2 \
	-buildDir=/cluster/data/caeRem2/bed/RepeatMasker > do.log 2>&1
    cd /cluster/data/caeRem2
    twoBitToFa caeRem2.rmsk.2bit stdout | faSize stdin \
	> caeRem2.rmsk.2bit.faSize
    cat caeRem2.rmsk.2bit.faSize
    #	161944676 bases (15087728 N's 146856948 real
    #	145425741 upper 1431207 lower) in 1 sequences in 1 files
    #	%0.88 masked total, %0.97 masked real
    ln -s caeRem2.rmsk.2bit caeRem2.2bit

    mkdir /san/sanvol1/scratch/worms/caeRem2 
    cp -p caeRem2.2bit /san/sanvol1/scratch/worms/caeRem2
    cp -p chrom.sizes /san/sanvol1/scratch/worms/caeRem2
    cp -p downloads/caeRem2.chrUn.lift /san/sanvol1/scratch/worms/caeRem2

###########################################################################
## Window Masker (DONE - 2007-03-29 - Hiram)
    ssh kolossus
    mkdir /cluster/data/caeRem2/bed/WindowMasker
    cd /cluster/data/caeRem2/bed/WindowMasker
    time nice -n +19 ~/kent/src/hg/utils/automation/doWindowMasker.pl \
	-workhorse kolossus \
	-buildDir=/cluster/data/caeRem2/bed/WindowMasker caeRem2 > do.log 2>&1 &
    #	real    12m19.926s
     twoBitToFa caeRem2.wmsk.sdust.2bit stdout \
	| faSize stdin > caeRem2.wmsk.sdust.2bit.faSize 2>&1
    cat caeRem2.wmsk.sdust.2bit.faSize
    #	161944676 bases (15087728 N's 146856948 real
    #	100232042 upper 46624906 lower) in 1 sequences in 1 files
    #	%28.79 masked total, %31.75 masked real

    ssh hgwdev
    cd /cluster/data/caeRem2/bed/WindowMasker
    hgLoadBed -strict caeRem2 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 1122372 elements of size 3
 
###########################################################################
## Simple Repeats (DONE - 2007-03-29 - Hiram)
    ssh titan
    mkdir /cluster/data/caeRem2/bed/simpleRepeat
    cd /cluster/data/caeRem2/bed/simpleRepeat
    time twoBitToFa ../../caeRem2.unmasked.2bit stdout \
	| nice -n +19 trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
	-bedAt=simpleRepeat.bed -tempDir=/scratch/tmp > do.out 2>&1
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
    #	real    71m21.810s
    ssh hgwdev
    cd /cluster/data/caeRem2/bed/simpleRepeat
    nice -n +19 hgLoadBed caeRem2 simpleRepeat \
      simpleRepeat.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    #	Loaded 37049 elements of size 16
    featureBits caeRem2 simpleRepeat > fb.caeRem2.simpleRepeat.txt 2>&1
    #	8604824 bases of 146898439 (5.858%) in intersection
    ssh kkstore01
    cd /cluster/data/caeRem2
    cat bed/simpleRepeat/trfMask.bed \
        | twoBitMask -add -type=.bed caeRem2.rmsk.2bit stdin \
	caeRem2.rmskTrf.2bit
     twoBitToFa caeRem2.rmskTrf.2bit stdout | faSize stdin \
	> faSize.caeRem2.rmskTrf.txt
    cat faSize.caeRem2.rmskTrf.txt
    #	161944676 bases (15087728 N's 146856948 real
    #	145166254 upper 1690694 lower) in 1 sequences in 1 files
    #	%1.04 masked total, %1.15 masked real

###########################################################################
### prepare contig 2bit file for blastz runs
    ssh kkstore01
    mkdir /cluster/data/caeRem2/maskedContigs
    cd /cluster/data/caeRem2/maskedContigs
    ln -s ../downloads/caeRem2.chrUn.lift .
    ~/kent/src/hg/utils/lft2BitToFa.pl ../caeRem2.2bit \
	caeRem2.chrUn.lift > caeRem2.contigs.fa
    faToTwoBit caeRem2.contigs.fa caeRem2.contigs.2bit
    twoBitInfo caeRem2.contigs.2bit stdout | sort -k2nr > caeRem2.contigs.sizes
    cp -p caeRem2.contigs.2bit caeRem2.chrUn.lift caeRem2.contigs.sizes \
	/san/sanvol1/scratch/worms/caeRem2

############################################################################
## Default position (DONE - 2007-04-09 - Hiram)
    ssh hgwdev
    hgsql -e 'update dbDb set defaultPos="chrUn:34567-45678"
	where name="caeRem2";' hgcentraltest

############################################################################
## SWAP CB3 briggsae chain/net - (DONE - 2007-04-13 - Hiram)
    ssh kkstore02
    cd /cluster/data/cb3/bed/blastz.caeRem2.2007-04-13
    cat fb.cb3.chainCaeRem2Link.txt
    #	53199792 bases of 108433446 (49.062%) in intersection

    mkdir /cluster/data/caeRem2/bed/blastz.cb3.swap
    cd /cluster/data/caeRem2/bed/blastz.cb3.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 -bigClusterHub=kk \
	-swap \
	/cluster/data/cb3/bed/blastz.caeRem2.2007-04-13/DEF > swap.log 2>&1 &
    #	The typical failure:
    #	netChains: looks like previous stage was not successful (can't find
    #	[caeRem2.cb3.]all.chain[.gz]).

    time nice -n +19 doBlastzChainNet.pl -verbose=2 -bigClusterHub=kk \
	-continue=net -swap \
	/cluster/data/cb3/bed/blastz.caeRem2.2007-04-13/DEF > net.log 2>&1 &
    #	real    9m6.800s
    cat fb.caeRem2.chainCb3Link.txt
    #	63292118 bases of 146898439 (43.086%) in intersection

############################################################################
## SWAP caePb1 chain/net - (DONE - 2007-04-20 - Hiram)
    ssh kkstore02
    cd /cluster/data/caePb1/bed/blastz.caeRem2.2007-04-19
    cat fb.caePb1.chainCaeRem2Link.txt
    #	93139445 bases of 175247318 (53.147%) in intersection

    #	swap to caeRem2
    mkdir /cluster/data/caeRem2/bed/blastz.caePb1.swap
    cd /cluster/data/caeRem2/bed/blastz.caePb1.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/caePb1/bed/blastz.caeRem2.2007-04-19/DEF \
	-bigClusterHub=kk -swap > swap.log 2>&1 &
    cat fb.caeRem2.chainCaePb1Link.txt
    #	78402339 bases of 146898439 (53.372%) in intersection

############################################################################
## reset defaultDb to caeRem2 (DONE - 2007-04-21 - Hiram)
    ssh hgwdev
    hgsql -e 'update defaultDb set name="caeRem2" where genome="C. remanei";' \
	hgcentraltest

############################################################################
## SWAP ce4 chain/net - (DONE - 2007-04-02 - Hiram)
    cd /cluster/data/ce4/bed/blastz.caeRem2.2007-03-29
    cat fb.ce4.chainCaeRem2Link.txt
    #	45160539 bases of 100281244 (45.034%) in intersection

    mkdir /cluster/data/caeRem2/bed/blastz.ce4.swap
    cd /cluster/data/caeRem2/bed/blastz.ce4.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
	/cluster/data/ce4/bed/blastz.caeRem2.2007-03-29/DEF \
	-swap > swap.log 2>&1 &
    #	real    3m52.831s
    cat fb.caeRem2.chainCe4Link.txt
    #	52254587 bases of 146898439 (35.572%) in intersection

############################################################################
## SWAP priPac1 chain/net - (DONE - 2007-04-21 - Hiram)
    cd /cluster/data/priPac1/bed/blastz.caeRem2.2007-04-20
    cat fb.priPac1.chainCaeRem2Link.txt
    #	9047547 bases of 145948246 (6.199%) in intersection

    mkdir /cluster/data/caeRem2/bed/blastz.priPac1.swap
    cd /cluster/data/caeRem2/bed/blastz.priPac1.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 -bigClusterHub=kk \
	/cluster/data/priPac1/bed/blastz.caeRem2.2007-04-20/DEF \
	-swap > swap.log 2>&1 &
    #	less than 2 minutes
    cat fb.caeRem2.chainPriPac1Link.txt
    #	8971992 bases of 146898439 (6.108%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2007-04-23 - Hiram)
    # Use -repMatch=50 (based on size -- for human we use 1024, and 
    # C. remanei size is ~5.1% of human judging by gapless caeRem2 vs. hg18 
    # genome sizes from featureBits.
    ssh kolossus
    cd /cluster/data/caeRem2
    blat caeRem2.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/11.ooc -repMatch=50
    #	Wrote 46981 overused 11-mers to jkStuff/11.ooc
    cp -p jkStuff/11.ooc /san/sanvol1/scratch/worms/caeRem2

#########################################################################
# GENBANK AUTO UPDATE (DONE - 2007-04-23,25 - Hiram)
    # align with latest genbank process.
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # edit etc/genbank.conf to add caeRem2 just before caeRem1

# caeRem2 (C. remanei)
caeRem2.serverGenome = /cluster/data/caeRem2/caeRem2.2bit
caeRem2.clusterGenome = /iscratch/i/worms/caeRem2/caeRem2.2bit
caeRem2.ooc = /iscratch/i/worms/caeRem2/11.ooc
caeRem2.lift = /iscratch/i/worms/caeRem2/caeRem2.chrUn.lift
caeRem2.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
caeRem2.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
caeRem2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
caeRem2.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
caeRem2.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
caeRem2.refseq.mrna.native.load = yes
caeRem2.refseq.mrna.xeno.load  = yes
caeRem2.refseq.mrna.xeno.loadDesc = yes
caeRem2.genbank.mrna.xeno.load = yes
caeRem2.genbank.est.native.load = yes
caeRem2.genbank.est.native.loadDesc = no
caeRem2.downloadDir = caeRem2
caeRem2.perChromTables = no

    cvs ci -m "Added caeRem2." etc/genbank.conf
    # update /cluster/data/genbank/:
    make etc-update

    ssh kkstore01
    cd /cluster/data/genbank
    time nice -n +19 bin/gbAlignStep -initial caeRem2 &
    #	logFile: var/build/logs/2007.07.10-17:00:24.caeRem2.initalign.log
    #	real    255m14.345s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad caeRem2
    #   logFile: var/dbload/hgwdev/logs/2007.07.11-13:49:20.dbload.log
    #   17:10.13

    #	logFile: var/dbload/hgwdev/logs/2007.04.23-14:31:51.dbload.log
    #	real    6m57.557s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # add caeRem2 to:
        etc/align.dbs
        etc/hgwdev.dbs
    cvs ci -m "Added caeRem2 - C. remanei" etc/align.dbs etc/hgwdev.dbs
    make etc-update

############################################################################
#  BLATSERVERS ENTRY (DONE - 2007-04-23 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("caeRem2", "blat10", "17788", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("caeRem2", "blat10", "17789", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

##########################################################################
## summarize chainLink measurements (2007-04-25 - Hiram)
# org       on caeRem2 on other
# caePb1    53.372     53.147
# cb3       43.086     49.062
# ce4       35.572     45.034
# priPac1    6.108      6.199

##########################################################################
#  Set default position to cover the 16S ribosomal RNA area
##	(DONE - 2007-04-26 - Hiram)
    ssh hgwdev
    hgsql -e 'update dbDb set defaultPos="chrUn:103531185-103538203"
	where name="caeRem2"' hgcentraltest

############################################################################
## Adding a Photograph, from Eric Hagg in email 2007-04-07 12:04
    #	-rw-r--r--  1 369754 Jan  1  2000 remanei_midsection.jpg
    #	remanei_midsection.jpg JPEG 716x972 DirectClass 361kb 0.000u 0:01
    mkdir /cluster/data/caeRem2/photograph
    cd /cluster/data/caeRem2/photograph
    convert -sharpen 0 -normalize \
	-geometry "400x400" "remanei_midsection.jpg" \
	Caenorhabditis_remanei.jpg
    #	check this .jpg file into the browser doc source tree images directory
    cvs add -kb Caenorhabditis_remanei.jpg
    cvs commit Caenorhabditis_remanei.jpg
    cp -p Caenorhabditis_remanei.jpg /usr/local/apache/htdocs/images

##########################################################################
## Creating downloads (DONE - 2007-05-01 - Hiram)
    #	There is only one chrom, make its trfMaskChrom file exist
    ssh hgwdev
    mkdir /cluster/data/caeRem2/bed/simpleRepeat/trfMaskChrom
    cd /cluster/data/caeRem2/bed/simpleRepeat/trfMaskChrom
    ## symlink didn't work here, the symlink ended up in the tar file
    cp -p ../trfMask.bed ./chrUn.bed
    cd  /cluster/data/caeRem2
    nice -n +19 /cluster/bin/scripts/makeDownloads.pl caeRem2 
    ## *!* EDIT THE README.txt FILES *!* ##
    ## Also add the WindowMasker business to bigZips

    cd /cluster/data/caeRem2/goldenPath/bigZips
    mkdir chrUn
    cp -p ../../bed/WindowMasker/windowmasker.sdust.bed.gz ./chrUn/
    gunzip chrUn/*.gz
    tar cvzf ./chromWMSdust.bed.tar.gz ./chrUn
    rm -fr chrUn
    md5sum *.gz > md5sum.txt
    ## add description to README.txt
    # chromWMSdust.bed.tar.gz - per-chrom bed files used as repeat areas
    #	identified by WindowMasker with the -sdust option.

    ## make the links to the liftOver files:
    cd /cluster/data/caeRem2/bed/liftOver
    md5sum *.gz > md5sum.txt
    cd /usr/local/apache/htdocs/goldenPath/caeRem2/liftOver
    ln -s /cluster/data/caeRem2/bed/liftOver/* .

##########################################################################
## Creating pushQ (DONE - 2007-05-01 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/caeRem2/pushQ
    cd /cluster/data/caeRem2/pushQ
    /cluster/bin/scripts/makePushQSql.pl caeRem2 > caeRem2.sql 2> stderr.out
    ## check the stderr.out for anything that needs to be fixed
    ## copy caeRem2.sql to hgwbeta:/tmp
    ## then on hgwbeta
    hgsql qapushq < caeRem2.sql

###########################################################################
# ELEGANS (ce4) PROTEINS TRACK (DONE -  Hiram - 2007-05-01)
    ssh kkstore01

    #	breaking up this target genome into manageable pieces
    mkdir /cluster/data/caeRem2/blastDb
    cd /cluster/data/caeRem2
    twoBitToFa caeRem2.unmasked.2bit temp.fa
    faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
    #	210 pieces of 210 written
    rm temp.fa
    cd blastDb
    for i in *.fa
    do
	/cluster/bluearc/blast229/formatdb -i $i -p F
    done
    rm *.fa

    ##	copy to san for kluster access
    mkdir -p /san/sanvol1/scratch/worms/caeRem2/blastDb
    cd /san/sanvol1/scratch/worms/caeRem2/blastDb
    rsync -a --progress --stats /cluster/data/caeRem2/blastDb/ .

    ## create the query protein set
    mkdir -p /cluster/data/caeRem2/bed/tblastn.ce4SG
    cd /cluster/data/caeRem2/bed/tblastn.ce4SG
    echo  /san/sanvol1/scratch/worms/caeRem2/blastDb/*.nsq | xargs ls -S \
	| sed "s/\.nsq//"  > query.lst
    wc -l query.lst
# 210 query.lst

   # we want around 50000 jobs
   calc `wc /cluster/data/ce4/bed/blat.ce4SG/ce4SG.psl | awk "{print \\\$1}"`/\(50000/`wc query.lst | awk "{print \\\$1}"`\)
# 23192/(50000/210) = 97.406400

   mkdir -p /cluster/bluearc/worms/caeRem2/bed/tblastn.ce4SG/sgfa
   split -l 98 /cluster/data/ce4/bed/blat.ce4SG/ce4SG.psl \
	/cluster/bluearc/worms/caeRem2/bed/tblastn.ce4SG/sgfa/sg
   ln -s /cluster/bluearc/worms/caeRem2/bed/tblastn.ce4SG/sgfa sgfa
   cd sgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
     done
   cd ..
   ls -1S sgfa/*.fa > sg.lst
   mkdir -p /cluster/bluearc/worms/caeRem2/bed/tblastn.ce4SG/blastOut
   ln -s /cluster/bluearc/worms/caeRem2/bed/tblastn.ce4SG/blastOut ./blastOut
   for i in `cat sg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   
   cd /cluster/data/caeRem2/bed/tblastn.ce4SG
   cat << '_EOF_' > template
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
    # << happy emacs

   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/caeRem2/blastDb.lft carry $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/ce4/bed/blat.ce4SG/protein.lft warn $f.3

        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome

    ssh pk
    cd /cluster/data/caeRem2/bed/tblastn.ce4SG
    gensub2 query.lst sg.lst template jobList
    para create jobList
#    para try, check, push, check etc.
# Completed: 49770 of 49770 jobs
# CPU time in finished jobs:     773169s   12886.15m   214.77h    8.95d  0.025 y
# IO & Wait Time:                173488s    2891.47m    48.19h    2.01d  0.006 y
# Average job time:                  19s       0.32m     0.01h    0.00d
# Longest finished job:            1778s      29.63m     0.49h    0.02d
# Submission to last job:          6918s     115.30m     1.92h    0.08d

    # do the cluster run for chaining
    ssh pk
    mkdir /cluster/data/caeRem2/bed/tblastn.ce4SG/chainRun
    cd /cluster/data/caeRem2/bed/tblastn.ce4SG/chainRun
    cat << '_EOF_' > template
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
    # << happy emacs

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin /cluster/bluearc/worms/caeRem2/bed/tblastn.ce4SG/blastOut/c.`basename $1`.psl)
'_EOF_'
    # << happy emacs
    chmod +x chainOne

    ls -1dS /cluster/bluearc/worms/caeRem2/bed/tblastn.ce4SG/blastOut/sg?? \
	> chain.lst
    gensub2 chain.lst single template jobList
    para create jobList
    para maxNode 30
    para try, check, push, check etc.
# Completed: 237 of 237 jobs
# CPU time in finished jobs:       7408s     123.46m     2.06h    0.09d  0.000 y
# IO & Wait Time:                  1415s      23.59m     0.39h    0.02d  0.000 y
# Average job time:                  37s       0.62m     0.01h    0.00d
# Longest finished job:             867s      14.45m     0.24h    0.01d
# Submission to last job:          1227s      20.45m     0.34h    0.01d

    ssh kkstore01
    cd /cluster/data/caeRem2/bed/tblastn.ce4SG/blastOut
    for i in sg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort -T /scratch/tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq \
	> /cluster/data/caeRem2/bed/tblastn.ce4SG/blastCe4SG.psl
    cd ..
    pslCheck blastCe4SG.psl

    # load table 
    ssh hgwdev
    cd /cluster/data/caeRem2/bed/tblastn.ce4SG
    hgLoadPsl caeRem2 blastCe4SG.psl

    # check coverage
    featureBits caeRem2 blastCe4SG 
    #	19763359 bases of 146898439 (13.454%) in intersection
    featureBits caePb1 blastCe4SG
    #	22988044 bases of 175247318 (13.117%) in intersection
    featureBits priPac1 blastCe4SG 
    #	5617285 bases of 145948246 (3.849%) in intersection
    featureBits cb3 blastCe4SG
    #	18218293 bases of 108433446 (16.801%) in intersection
    featureBits ce4 sangerGene
    #	27906202 bases of 100281244 (27.828%) in intersection

    ssh kkstore01
    rm -rf /cluster/data/caeRem2/bed/tblastn.ce4SG/blastOut
    rm -rf /cluster/bluearc/worms/caeRem2/bed/tblastn.ce4SG
    rmdir /cluster/bluearc/worms/caeRem2/bed
    rmdir /cluster/bluearc/worms/caeRem2
#end tblastn

#############################################################################
# LIFTOVER TO caeRem3 (WORKING - 2008-05-23 - Hiram)
    ssh kkstore01
    screen -r -d	#	use screen to control this job
    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl -debug caeRem2 caeRem3 \
      -ooc /cluster/data/caeRem2/jkStuff/11.ooc
    # Real run:
    cd /cluster/data/caeRem2/bed/blat.caeRem3.2008-05-23
    time nice -n +19 doSameSpeciesLiftOver.pl caeRem2 caeRem3 \
      -ooc /cluster/data/caeRem2/jkStuff/11.ooc
    #	real    8m15.850s

    #	a scaffold to scaffold lift over file was made manually
    #	for Sheldon McKay.
    #	see also the scripts left in:
    #	/cluster/data/caeRem2/bed/blat.caeRem3.supers/run.blat
    #	/cluster/data/caeRem2/bed/blat.caeRem3.supers/run.chain
#############################################################################
