# for emacs: -*- mode: sh; -*-

# Gasterosteus aculeatus - Broad Institute 1.0

# Jeff McKinnon gave permission for photo 8/16/06 and asked for notification 
# when the browser is public --
# e-mail: mckinnoj@uww.edu   http://facstaff.uww.edu/mckinnoj/mckinnon.html

#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AANH01

#########################################################################
# DOWNLOAD SEQUENCE (DONE 8/15/06 angie)
    ssh kkstore05
    mkdir /cluster/store12/gasAcu1
    ln -s /cluster/store12/gasAcu1 /cluster/data/gasAcu1
    mkdir /cluster/data/gasAcu1/downloads
    cd /cluster/data/gasAcu1/downloads
    foreach f (Stickleback1.0.agp \
               Stickleback1.0.agp.chromosome.fasta.gz \
               Stickleback1.0.agp.chromosome.qual.gz \
               assembly.agp \
               assembly.bases.gz \
               assembly.ps)
      wget --timestamping \
        ftp://ftp.broad.mit.edu/pub/assemblies/fish/stickleback/gasAcu1/$f
    end
    faSize Stickleback1.0.agp.chromosome.fasta.gz 
#463338706 bases (16726587 N's 446612119 real 446612119 upper 0 lower) in 105 sequences in 1 files
#Total size: mean 4412749.6 sd 1314228.9 min 83130 (XIII.20000001-20083130) max 5000000 (I.1-5000000) median 5000000
#N count: mean 159300.8 sd 213368.9
#U count: mean 4253448.8 sd 1311287.4
#L count: mean 0.0 sd 0.0
    # Doh!  Their chromosome fasta is chunked into 5M pieces.  Not so 
    # useful for us.
    # AGP and FA use different names, too.  AGP has "groupI" and FA has "I".
    # A lot of our software really wants to see "chr" at the beginning.
    # So make AGP with substituted names and use that to construct the 
    # chrom FA.
    sed -e 's/^group/chr/' Stickleback1.0.agp > UCSC.gasAcu1.agp
    # Their chromosome qual is chunked into 5M pieces too.  (There is also
    # an assembly.agp.qual.gz that has *scaffolds* chunked.)  Unchunk, 
    # relying on the fact that chunks are in order in the file:
    zcat Stickleback1.0.agp.chromosome.qual.gz \
    | perl -wpe 's/^>(\S+)\.1-\d+.*/>chr$1/;  s/^>\S+\.\d+-\d+.*\n$//;' \
      > UCSC.gasAcu1.qual


#########################################################################
# MAKE GENOME DB (DONE 8/17/06 angie)
    ssh kkstore05
    cd /cluster/data/gasAcu1
    cat > gasAcu1.config.ra <<EOF
# Config parameters for makeGenomeDb.pl:
db gasAcu1
clade vertebrate
genomeCladePriority 120
scientificName Gasterosteus aculeatus
commonName Stickleback
assemblyDate Feb. 2006
assemblyLabel Broad Institute gasAcu1 (1.0)
orderKey 40
mitoAcc 16356995
fastaFiles /cluster/data/gasAcu1/downloads/assembly.bases.gz
agpFiles /cluster/data/gasAcu1/downloads/UCSC.gasAcu1.agp
qualFiles /cluster/data/gasAcu1/downloads/UCSC.gasAcu1.qual
dbDbSpeciesDir stickleback
EOF

    # Run with -debug to see what it would do / check for issues:
    ~kent/src/utils/makeGenomeDb.pl gasAcu1.config.ra -debug -verbose=3

    # Run it for real:
    ~kent/src/utils/makeGenomeDb.pl gasAcu1.config.ra \
      >& makeGenomeDb.log & tail -f makeGenomeDb.log
    # Follow the directions at the end of the log after
#NOTES -- STUFF THAT YOU WILL HAVE TO DO --


#########################################################################
# REPEATMASKER (DONE (but might need better libs??) 8/16/06 angie)
    ssh kkstore05
    # Run -debug to create the dir structure and preview the scripts:
    ~/kent/src/utils/doRepeatMasker.pl gasAcu1 -verbose 3 -debug
    # Run it for real and tail the log:
    cd /cluster/data/gasAcu1/bed/RepeatMasker.2006-08-16
    ~/kent/src/utils/doRepeatMasker.pl gasAcu1 -verbose 3 \
      >& do.log & tail -f do.log
    # RM version info from do.log:
#grep version of RepeatMasker$ /cluster/bluearc/RepeatMasker/RepeatMasker
##    March 20 2006 (open-3-1-5) version of RepeatMasker
#grep RELEASE /cluster/bluearc/RepeatMasker/Libraries/RepeatMaskerLib.embl
#CC   RELEASE 20060315;                                            *
    # The cluster job crashed because no repeats were found on chrM 
    # so check out line+ failed.  NBD.  Manually made run.cluster/run.time
    # and continued:
    ~/kent/src/utils/doRepeatMasker.pl gasAcu1 -verbose 3 -continue cat \
      >>& do.log & tail -f do.log
    ssh hgwdev
    featureBits gasAcu1 rmsk
#11005259 bases of 446627861 (2.464%) in intersection
    # Wow, that is awfully skimpy.  Stickleback is a ~675M vertebrate genome,
    # probably pretty compact, but still...
    # RepeatMaskerLib.embl has only 2 stickleback-specific repeats.


#########################################################################
# REPEATMASKER - FUGU LIBS (DONE 8/17/06 angie)
    # RepeatMasker with stickleback as the species had very low coverage.
    # I found a paper on stickleback sex determination where they studied
    # repeats and they said they used fugu libs, so give that a try...
    # PMID: 15324658
    # http://www.sciencedirect.com/science?_ob=ArticleURL&_udi=B6VRT-4D56PX0-N&_coverDate=08%2F24%2F2004&_alid=435874407&_rdoc=1&_fmt=&_orig=search&_qd=1&_cdi=6243&_sort=d&view=c&_acct=C000059601&_version=1&_urlVersion=0&_userid=4428&md5=e5d62090265bd7b92b95f8f4cda5e079
    ssh kkstore05
    # Run -debug to create the dir structure and preview the scripts:
    ~/kent/src/utils/doRepeatMasker.pl gasAcu1 -species fugu -verbose 3 -debug
    # Run it for real and tail the log:
    cd /cluster/data/gasAcu1/bed/RepeatMasker.2006-08-17
    ~/kent/src/utils/doRepeatMasker.pl gasAcu1 -species fugu -verbose 3 \
      >& do.log & tail -f do.log
    # RM version info from do.log:
#grep version of RepeatMasker$ /cluster/bluearc/RepeatMasker/RepeatMasker
##    March 20 2006 (open-3-1-5) version of RepeatMasker
#grep RELEASE /cluster/bluearc/RepeatMasker/Libraries/RepeatMaskerLib.embl
#CC   RELEASE 20060315;                                            *
    featureBits gasAcu1 rmsk
#16563793 bases of 446627861 (3.709%) in intersection
    # Still seems rather slim, but it's an improvement.


#########################################################################
# SIMPLE REPEATS (TRF) (DONE 8/16/06 angie)
    ssh kolossus
    nice tcsh
    mkdir /cluster/data/gasAcu1/bed/simpleRepeat
    cd /cluster/data/gasAcu1/bed/simpleRepeat
    twoBitToFa ../../gasAcu1.unmasked.2bit stdout \
    | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
      -bedAt=simpleRepeat.bed -tempDir=/tmp \
    >& trf.log & tail -f trf.log
    # ~40 minutes
    # Make a filtered version for sequence masking:
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
    splitFileByColumn trfMask.bed trfMaskChrom

    # Load unfiltered repeats into the database:
    ssh hgwdev
    hgLoadBed gasAcu1 simpleRepeat \
      /cluster/data/gasAcu1/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    featureBits gasAcu1 simpleRepeat
#11062299 bases of 446627861 (2.477%) in intersection


#########################################################################
# MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE 8/16/06 angie)
    ssh kolossus
    cd /cluster/data/gasAcu1
    time twoBitMask gasAcu1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed gasAcu1.2bit
    # This warning is OK -- the extra fields are not block coordinates.
#Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks).
#0.372u 0.716s 0:04.34 24.8%     0+0k 0+0io 0pf+0w
    # Link to it from /gbdb:
    ssh hgwdev
    ln -s /cluster/data/gasAcu1/gasAcu1.2bit /gbdb/gasAcu1/gasAcu1.2bit


#########################################################################
# MAKE DOWNLOADABLE / GOLDENPATH FILES (DONE 8/17/06 angie)
    cd /cluster/data/gasAcu1
    ~/kent/src/utils/makeDownloads.pl gasAcu1 -verbose=2 \
      >& jkStuff/downloads.log & tail -f jkStuff/downloads.log
    # Edit README.txt files when done:
# *** Edit each README.txt to resolve any notes marked with "***":
#     /cluster/data/gasAcu1/goldenPath/database/README.txt
#     /cluster/data/gasAcu1/goldenPath/bigZips/README.txt
#     /cluster/data/gasAcu1/goldenPath/chromosomes/README.txt


#########################################################################
# BLASTZ/CHAIN/NET DANRER4 (DONE 10/17/06 angie)
    ssh kkstore05
    mkdir /cluster/data/gasAcu1/bed/blastz.danRer4.2006-10-16
    cd /cluster/data/gasAcu1/bed/blastz.danRer4.2006-10-16
    cat << '_EOF_' > DEF
# Stickleback vs. Zebrafish

# Try "human-fugu" (more distant, less repeat-killed than mammal) params +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Stickleback gasAcu1
SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.2bit
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/gasAcu1/chrom.sizes

# QUERY: Zebrafish danRer4
SEQ2_DIR=/san/sanvol1/scratch/danRer4/danRer4.2bit
SEQ2_CTGDIR=/san/sanvol1/scratch/danRer4/danRer4ChrUnNAScafs.2bit
SEQ2_LIFT=/san/sanvol1/scratch/danRer4/liftNAandUnScaffoldsToChrom.lft
SEQ2_LEN=/cluster/data/danRer4/chrom.sizes
SEQ2_CTGLEN=/san/sanvol1/scratch/danRer4/chromsUnNAScafs.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/cluster/data/gasAcu1/bed/blastz.danRer4.2006-10-16
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF -chainMinScore=5000 -chainLinearGap=loose \
      -bigClusterHub pk \
      -blastzOutRoot /cluster/bluearc/gasAcu1danRer4 >& do.log &
    tail -f do.log
    ln -s blastz.danRer4.2006-10-16 /cluster/data/gasAcu1/bed/blastz.danRer4
    nice featureBits gasAcu1 -chrom=chrI chainDanRer4Link
#9296061 bases of 27557028 (33.734%) in intersection

    time nice -n 19 featureBits gasAcu1 chainDanRer4Link \
	> fb.gasAcu1.chainDanRer4Link.txt 2>&1
    #	157049518 bases of 446627861 (35.163%) in intersection

    #	I'm curious what a swap would look like on this one
    #	(DONE - 2006-12-08 - Hiram)
    ssh kkstore04
    mkdir /cluster/data/danRer4/bed/blastz.gasAcu1.swap
    cd /cluster/data/danRer4/bed/blastz.gasAcu1.swap
    time doBlastzChainNet.pl \
	/cluster/data/gasAcu1/bed/blastz.danRer4.2006-10-16/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-verbose=2 -swap -bigClusterHub=pk \
	-blastzOutRoot /cluster/bluearc/gasAcu1danRer4 > swap.log 2>&1 &
    #	real    38m51.527s

    ssh hgwdev
    cd /cluster/data/danRer4/bed/blastz.gasAcu1.swap
    time nice -n 19 featureBits danRer4 chainGasAcu1Link \
	> fb.danRer4.chainGasAcu1Link.txt 2>&1
    #	213083380 bases of 1626093931 (13.104%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET TETNIG1 (DONE 10/17/06 angie)
    ssh kkstore05
    mkdir /cluster/data/gasAcu1/bed/blastz.tetNig1.2006-10-17
    cd /cluster/data/gasAcu1/bed/blastz.tetNig1.2006-10-17
    cat << '_EOF_' > DEF
# Stickleback vs. Tetraodon

# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Stickleback gasAcu1
SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.2bit
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/gasAcu1/chrom.sizes

# QUERY: Tetraodon tetNig1
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit
SEQ2_CTGDIR=/san/sanvol1/scratch/tetNig1/chromsAndScafs/tetNig1ChromsRandomScafs.2bit
SEQ2_LIFT=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.lft
SEQ2_LEN=/cluster/data/tetNig1/chrom.sizes
SEQ2_CTGLEN=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/cluster/data/gasAcu1/bed/blastz.tetNig1.2006-10-17
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF -chainMinScore=5000 -chainLinearGap=loose \
      -bigClusterHub pk \
      -blastzOutRoot /cluster/bluearc/gasAcu1tetNig1 >& do.log &
    tail -f do.log
    ln -s blastz.tetNig1.2006-10-17 /cluster/data/gasAcu1/bed/blastz.tetNig1
    nice featureBits gasAcu1 -chrom=chrI chainTetNig1Link
#11175314 bases of 27557028 (40.553%) in intersection

    time nice -n 19 featureBits gasAcu1 chainTetNig1Link \
	> fb.gasAcu1.chainTetNig1Link.txt 2>&1 &
    #	187779775 bases of 446627861 (42.044%) in intersection

    #	I'm curious what a swap would look like on this one
    #	(DONE - 2006-12-08 - Hiram)
    ssh kkstore04
    mkdir /cluster/data/tetNig1/bed/blastz.gasAcu1.swap
    cd /cluster/data/tetNig1/bed/blastz.gasAcu1.swap
    doBlastzChainNet.pl \
	/cluster/data/gasAcu1/bed/blastz.tetNig1.2006-10-17/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-verbose=2 -swap -bigClusterHub pk \
	-blastzOutRoot /cluster/bluearc/gasAcu1tetNig1 > swap.log 2>&1 &
    ssh hgwdev
    cd /cluster/data/tetNig1/bed/blastz.gasAcu1.swap
    time nice -n 19 featureBits tetNig1 chainGasAcu1Link \
	> fb.tetNig1.chainGasAcu1Link.txt 2>&1 &
    #	171216660 bases of 342403326 (50.004%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET HG18 (DONE 10/17/06 angie)
    ssh kkstore05
    mkdir /cluster/data/gasAcu1/bed/blastz.hg18.2006-10-17
    cd /cluster/data/gasAcu1/bed/blastz.hg18.2006-10-17
    cat << '_EOF_' > DEF
# Stickleback vs. Human

# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Stickleback gasAcu1
SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.2bit
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/gasAcu1/chrom.sizes

# QUERY: Human hg18
SEQ2_DIR=/scratch/hg/hg18/hg18.2bit
SEQ2_LEN=/cluster/data/hg18/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/cluster/data/gasAcu1/bed/blastz.hg18.2006-10-17
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \
      -bigClusterHub pk \
      -blastzOutRoot /cluster/bluearc/gasAcu1hg18 >& do.log &
    tail -f do.log
    ln -s blastz.hg18.2006-10-17 /cluster/data/gasAcu1/bed/blastz.hg18
    nice featureBits gasAcu1 -chrom=chrI chainHg18Link
#2973001 bases of 27557028 (10.789%) in intersection

    ssh hgwdev
    cd /cluster/data/gasAcu1/bed/blastz.hg18.2006-10-17
    time nice -n 19 featureBits gasAcu1 chainHg18Link \
	> fb.gasAcu1.chainHg18Link.txt 2>&1 &
    #	50705450 bases of 446627861 (11.353%) in intersection

    #	Been swapped
    ssh hgwdev
    cd /cluster/data/hg18/bed/blastz.gasAcu1.swap
    time nice -n 19 featureBits hg18 chainGasAcu1Link \
	> fb.hg18.chainGasAcu1Link.txt 2>&1 &
    #	55424609 bases of 2881515245 (1.923%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET MM8 (DONE 10/17/06 angie)
    ssh kkstore05
    mkdir /cluster/data/gasAcu1/bed/blastz.mm8.2006-10-17
    cd /cluster/data/gasAcu1/bed/blastz.mm8.2006-10-17
    cat << '_EOF_' > DEF
# Stickleback vs. Mouse

# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Stickleback gasAcu1
SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.2bit
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/gasAcu1/chrom.sizes

# QUERY: Mouse mm8
SEQ2_DIR=/scratch/hg/mm8/mm8.2bit
SEQ2_LEN=/cluster/data/mm8/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/cluster/data/gasAcu1/bed/blastz.mm8.2006-10-17
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \
      -bigClusterHub pk \
      -blastzOutRoot /cluster/bluearc/gasAcu1mm8 >& do.log &
    tail -f do.log
    ln -s blastz.mm8.2006-10-17 /cluster/data/gasAcu1/bed/blastz.mm8
    nice featureBits gasAcu1 -chrom=chrI chainMm8Link
#2981604 bases of 27557028 (10.820%) in intersection

    ssh hgwdev
    cd /cluster/data/gasAcu1/bed/blastz.mm8.2006-10-17
    time nice -n 19 featureBits gasAcu1 chainMm8Link \
	> fb.gasAcu1.chainMm8Link.txt 2>&1 &

#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE 11/14/06 angie)
    # Use -repMatch=200 (based on size -- for human we use 1024, and 
    # stickleback size is ~15% of human judging by gapless gasAcu1 vs. hg18 
    # genome sizes from featureBits.  Bump it up a bit to be conservative.
    ssh kolossus
    blat /cluster/data/gasAcu1/gasAcu1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/gasAcu1/11.ooc -repMatch=200
#Wrote 5700 overused 11-mers to /cluster/bluearc/gasAcu1/11.ooc


#########################################################################
# GENBANK AUTO UPDATE (DONE 11/15/06 angie)
    # Make a liftAll.lft that specifies 5M chunks for genbank:
    ssh kkstore05
    cd /cluster/data/gasAcu1
    simplePartition.pl gasAcu1.2bit 5000000 /tmp/gasAcu1P
    cat /tmp/gasAcu1P/*/*.lft > jkStuff/liftAll.lft
    rm -r /tmp/gasAcu1P

    # align with latest genbank process.
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # edit etc/genbank.conf to add gasAcu1 just after dm2...

# gasAcu1 (G. Aculeatus)
gasAcu1.serverGenome = /cluster/data/gasAcu1/gasAcu1.2bit
gasAcu1.clusterGenome = /iscratch/i/gasAcu1/gasAcu1.2bit
gasAcu1.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
gasAcu1.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
gasAcu1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
gasAcu1.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
gasAcu1.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
gasAcu1.ooc = /cluster/bluearc/gasAcu1/11.ooc
gasAcu1.lift = /cluster/data/gasAcu1/jkStuff/liftAll.lft
gasAcu1.genbank.mrna.xeno.load = yes
gasAcu1.downloadDir = gasAcu1

    cvs ci -m "Added gasAcu1." etc/genbank.conf
    # update /cluster/data/genbank/:
    make etc-update

    # Edit src/lib/gbGenome.c to add new species.
    cvs ci -m "Added G. aculeatus (stickleback)." src/lib/gbGenome.c
    make install-server

    ssh kkstore02
    cd /cluster/data/genbank
    nice bin/gbAlignStep -initial gasAcu1 &
    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    nice ./bin/gbDbLoadStep -drop -initialLoad gasAcu1 &

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # add gasAcu1 to:
        etc/align.dbs
        etc/hgwdev.dbs
    cvs ci -m "Added gasAcu1." etc/align.dbs etc/hgwdev.dbs
    make etc-update


#########################################################################
# EXTRACT CHROM TO SCAFFOLD LIFTUP (DONE 3/6/07 angie)
# Ensembl genes use scaffold_* instead of chrUn.  Broad doesn't seem to 
# provide a scaffold->chrom mapping in their download files, but here's
# a way to extract one from ctg->chrom and ctg->scaffold agp...
    cd /cluster/data/gasAcu1/downloads
    # Distill contig coordinates to a single field that will be the same
    # for ctg->chrom and ctg->scaf maps:
    awk '$5 != "N" {print $6 ":" $7 ":" $8 "\t" $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $9;}' \
      UCSC.gasAcu1.agp \
    | sort > chrom.tmp
    awk '$5 != "N" {print $6 ":" $7 ":" $8 "\t" $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $9;}' \
      assembly.agp \
    | sort > scaf.tmp
    # Join the maps into what is AGP except it has a bunch of lines for 
    # each scaffold that must be collapsed into one line per scaffold:
    join -t "	" -o '1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 1.7' chrom.tmp scaf.tmp \
    | sort -k 1,1 -k 2n,2n \
      > tmp.agp
    # Make sure the join went right (same #lines in all):
    wc -l chrom.tmp scaf.tmp tmp.agp
#  16967 chrom.tmp
#  16967 scaf.tmp
#  16967 tmp.agp
    # Now collapse the many per-contig lines for each scaffold into a single
    # line per scaffold.  This produces gapless AGP with sequence numbers 
    # that aren't consecutive (but still monotonically increasing):
    perl -we 'while (<>) { \
                chomp; @w = split; \
                if (! defined $lastScaf) { \
                  ($chr, $chrS, $chrE, $seq, $type, \
                   $lastScaf, $firstScafS, $firstScafE, $strand) = @w; \
                  $lastScafS = $firstScafS;  $lastScafE = $firstScafE; \
                } elsif ($lastScaf ne $w[5]) { \
                  $scafS = ($firstScafS < $lastScafS) ? $firstScafS : $lastScafS;\
                  $scafE = ($firstScafE > $lastScafE) ? $firstScafE : $lastScafE;\
                  print "$chr\t$chrS\t$chrE\t$seq\t$type\t" . \
                        "$lastScaf\t$scafS\t$scafE\t$strand\n"; \
                  ($chr, $chrS, $chrE, $seq, $type, \
                   $lastScaf, $firstScafS, $firstScafE, $strand) = @w; \
                  $lastScafS = $firstScafS;  $lastScafE = $firstScafE; \
                } else { \
                  ($thisChr,undef,$chrE,undef,undef, \
                   undef,$lastScafS,$lastScafE,undef) = @w; \
                  die "$lastScaf $lastScafE: $thisChr != $chr" \
                    if ($thisChr ne $chr); \
                } \
              } \
              $scafS = ($firstScafS < $lastScafS) ? $firstScafS : $lastScafS;\
              $scafE = ($firstScafE > $lastScafE) ? $firstScafE : $lastScafE;\
              print "$chr\t$chrS\t$chrE\t$seq\t$type\t" . \
                    "$lastScaf\t$scafS\t$scafE\t$strand\n"; \
              ' tmp.agp \
      > UCSC.chromToScaffoldSansGaps.agp
    # fortunately agpToLift doesn't need gap lines in its input, just frags:
    agpToLift -revStrand < UCSC.chromToScaffoldSansGaps.agp \
      > ../jkStuff/UCSC.chromToScaffoldSansGaps.lft


#########################################################################
# ENSEMBL GENES (DONE 11/27/06 angie - RELOADED 3/8/07 angie)
    ssh hgwdev
    mkdir /cluster/data/gasAcu1/bed/ensembl
    cd /cluster/data/gasAcu1/bed/ensembl
    # Go to www.ensembl.org and click around their evolving interface 
    # to get the following types of data:
    # 1. a tab-separated file that relates Ensembl gene, transcript and
    #    protein IDs.  Save as ensGtp.txt.gz  
    # 2. peptide fasta with transcript IDs in header -> ensemblPep.fa.gz

    # They have started dumping GTF files so we can download that directly:
    wget 'ftp://ftp.ensembl.org/pub/current_gasterosteus_aculeatus/data/gtf/*'

    # Add "chr" to front of each line in the gene data gtf file to make 
    # it compatible with our software.
    # Also need to lift scaffold coords up to chrUn:
    gunzip -c Gasterosteus_aculeatus.BROADS1.43.gtf.gz \
    | sed -e 's/^group/chr/; s/^MT/chrM/;' \
    | liftUp -type=.gtf stdout ../../jkStuff/UCSC.chromToScaffoldSansGaps.lft \
       carry stdin \
    > ensGene.gtf
    ldHgGene -gtf -genePredExt gasAcu1 ensGene \
      /cluster/data/gasAcu1/bed/ensembl/ensGene.gtf

    # strip header line:
    gunzip -c ensGtp.txt.gz | tail +2 > ensGtp.txt
    hgLoadSqlTab gasAcu1 ensGtp ~/kent/src/hg/lib/ensGtp.sql ensGtp.txt

    gunzip -c ensemblPep.fa.gz \
    | perl -wpe 's/\|.*//' \
    > ensPep.fa
    # ONE TIME ONLY... seven protein sequences were chopped up in the file 
    # from BioMart and the fragments appeared as competing duplicate 
    # sequences.  Ensembl helpdesk is still looking at it but sent the 
    # correct whole sequences for the seven, so I'm manually editing those 
    # in.  The ID's are ENSGACT00000000021, ENSGACT00000000176, 
    # ENSGACT00000017572, ENSGACT00000011000, ENSGACT00000000042, 
    # ENSGACT00000000087, ENSGACT00000022185.
    hgPepPred gasAcu1 generic ensPep ensPep.fa


#########################################################################
#########################################################################

#########################################################################
# BLASTZ/CHAIN/NET FR1 (DONE - 2006-12-20 - Hiram)
##  no chrUn in gasAcu1, and align to fr1 scaffolds,
##	results lifted to fr1 chrUn coordinates
    ssh kkstore04
    mkdir /cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08
    cd /cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08
    cat << '_EOF_' > DEF
# Stickleback vs. Fugu

# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Stippleback gasAcu1, no chrUn in this run
SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.sizes
SEQ1_CHUNK=40000000
SEQ1_LIMIT=30
SEQ1_LAP=0

# QUERY: Fugu fr1
#       Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates
SEQ2_DIR=/san/sanvol1/scratch/fr1/chrUn.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/fr1/fr1.UnScaffolds.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/fr1/scaffold.sizes
SEQ2_LIFT=/san/sanvol1/scratch/fr1/UnScaffolds/ordered.lft
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=pk \
	-blastzOutRoot /cluster/bluearc/gasAcu1Fr1 > do.log 2>&1 &
    #	real    272m33.395s
    #	user    0m0.189s
    #	sys     0m0.163s

    ssh hgwdev
    cd /cluster/data/gasAcu1/bed
    ln -s blastz.fr1.2006-12-08 blastz.fr1
    cd blastz.fr1
    time nice -n 19 featureBits gasAcu1 chainFr1Link \
	> fb.gasAcu1.chainFr1Link.txt 2>&1 &
    #	146655780 bases of 446627861 (32.836%) in intersection

    mkdir /cluster/data/fr1/bed/blastz.gasAcu1.swap
    cd /cluster/data/fr1/bed/blastz.gasAcu1.swap
    time doBlastzChainNet.pl \
	/cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08/DEF \
	-chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-swap -bigClusterHub=pk > do.log 2>&1 &

    time doBlastzChainNet.pl \
	/cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08/DEF \
	-continue=net -chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-swap -bigClusterHub=pk > net.log 2>&1 &
    #	real    24m33.761s
    #	user    0m0.125s
    #	sys     0m0.080s

    ssh hgwdev
    cd /cluster/data/fr1/bed/blastz.gasAcu1.swap
    time nice -n 19 featureBits fr1 chainGasAcu1Link \
	> fb.fr1.chainGasAcu1Link.txt 2>&1 &
    #	148005715 bases of 315518167 (46.909%) in intersection

#######################################################################
## Swap of oryLat1 alignments to gasAcu1 (DONE - 2006-12-08 - Hiram)
## This sequence duplicated in oryLat1.txt too
    ssh kkstore04
    cd /cluster/data/gasAcu1/bed
    rm blastz.oryLat1.swap
    mkdir /cluster/data/gasAcu1/bed/blastz.oryLat1.swap
    cd /cluster/data/gasAcu1/bed/blastz.oryLat1.swap
    time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \
	/cluster/data/oryLat1/bed/blastz.gasAcu1.2006-12-08/DEF \
	-chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \
	-swap > swap.log 2>&1 &
    #	real    26m32.549s

    ssh hgwdev
    cd /cluster/data/gasAcu1/bed/blastz.oryLat1.swap
    nice -n 19 featureBits -noRandom gasAcu1 chainOryLat1Link \
	>fb.gasAcu1.oryLat1.txt 2>&1 &
    #	148335558 bases of 391398261 (37.899%) in intersection

###########################################################################
## checking measurements to determine which fish is close to which fish
## 2006-12-08 - Hiram

#                         featureBits chainLink measures
#                                           chainGasAcu1Link  chain linearGap
#    distance                      on gasAcu1    on other   minScore
#  1  0.xxxx - tetraodon tetNig1   (% 42.044) (% 50.004)       5000     loose
#  3  0.xxxx - zebrafish danRer4   (% 35.163) (% 13.104)       5000     loose
#  4  0.xxxx - fugu fr1            (% 32.836) (% 46.909)       2000     loose
#  5  0.xxxx - human hg18          (% 11.353) (%  1.923)       2000     loose
#  6  0.xxxx - mouse mm8           (% 11.070) (%  x.923)       2000
# from oryLat1.txt:
#  2  0.xxxx - medaka oryLat1      (% 37.899) (% 28.110)       2000     loose
######
danRer4 #	157049518 bases of 446627861 (35.163%) in intersection
tetNig1 #	187779775 bases of 446627861 (42.044%) in intersection
hg18 #	50705450 bases of 446627861 (11.353%) in intersection
on hg18 #	55424609 bases of 2881515245 (1.923%) in intersection
on tetNig1 #	171216660 bases of 342403326 (50.004%) in intersection
on danRer4 #	213083380 bases of 1626093931 (13.104%) in intersection
fr1 #	146655780 bases of 446627861 (32.836%) in intersection
on fr1 #	148005715 bases of 315518167 (46.909%) in intersection

#######################################################################
## BLASTZ oryLat1 chrUn contigs/scaffolds only (DONE - 2006-12-20 - Hiram)
##  This was an experiment, not used
    ssh kkstore05
    mkdir /cluster/data/gasAcu1/bed/blastz.oryLat1.Un.2006-12-20
    cd /cluster/data/gasAcu1/bed/blastz.oryLat1.Un.2006-12-20
    cat << '_EOF_' > DEF
# Stickleback vs. Medaka, both chrUn random contigs/scaffolds only

# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
    
# TARGET: Stickleback gasAcu1 chrUn only
#	chrUn in contigs for this alignment run
#	The largest is 418,000 bases and there are 5,000 of them.
SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUn.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUn.sdTrf.sizes
SEQ1_CTGDIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUnContigsOnly.sdTrf.2bit
SEQ1_CTGLEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUnContigsOnly.sdTrf.sizes
SEQ1_LIFT=/san/sanvol1/scratch/gasAcu1/chrUn.extraCloneGap.lift
SEQ1_CHUNK=1000000
SEQ1_LIMIT=100
SEQ1_LAP=0

# QUERY: Medaka oryLat1 chrUn only
#       chrUn in scaffolds for this alignment run
#	The largest scaffold is only 76,000 bases, there are 36,000 of them
#	and a total size of 119 million.
SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.chrUn.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/oryLat1/oryLat1.chrUn.sdTrf.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1.UnScaffoldsOnly.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1.UnScaffoldsOnly.sdTrf.sizes
SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift
SEQ2_CHUNK=1000000
SEQ2_LIMIT=100
SEQ2_LAP=0


BASE=/cluster/data/gasAcu1/bed/blastz.oryLat1.Un.2006-12-20
TMPDIR=/scratch/tmp
'_EOF_'

    time doBlastzChainNet.pl -verbose=2 DEF \
	-chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-blastzOutRoot /cluster/bluearc/gasAcu1OryLat1 \
	-bigClusterHub=kk > do.log 2>&1 &
    #	real    106m42.410s
    #	user    0m0.087s
    #	sys     0m0.062s
    #	Had some difficulty with the kk kluster, continuing after getting the
    #	jobs completed
    time doBlastzChainNet.pl -verbose=2 DEF \
	-chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-blastzOutRoot /cluster/bluearc/gasAcu1OryLat1 \
	-continue=cat -bigClusterHub=kk > cat.log 2>&1 &

#######################################################################
## BLASTZ Stickleback chroms vs Medaka random contigs
##	(DONE - 2006-12-20 - Hiram)
##	This was an experiment, not used
    ssh kkstore05
    mkdir /cluster/data/gasAcu1/bed/blastz.oryLat1.ChrUn.2006-12-20
    cd /cluster/data/gasAcu1/bed/blastz.oryLat1.ChrUn.2006-12-20
    cat << '_EOF_' > DEF
# Stickleback chroms vs. Medaka random contigs

# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
    
# TARGET: Stickleback gasAcu1 no chrUn sequence, only chroms
#       The largest is 32 million bases, there are 22 of them
SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.sizes
SEQ1_CHUNK=40000000
SEQ1_LAP=10000

# QUERY: Medaka oryLat1 chrUn only
#       chrUn in scaffolds for this alignment run
#       The largest scaffold is only 76,000 bases, there are 36,000 of them
#       and a total size of 119 million.
SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.chrUn.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/oryLat1/oryLat1.chrUn.sdTrf.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1.UnScaffoldsOnly.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1.UnScaffoldsOnly.sdTrf.sizes
SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift
SEQ2_CHUNK=1000000
SEQ2_LIMIT=40
SEQ2_LAP=0


BASE=/cluster/data/gasAcu1/bed/blastz.oryLat1.ChrUn.2006-12-20
TMPDIR=/scratch/tmp
'_EOF_'

    time doBlastzChainNet.pl -verbose=2 DEF \
	-chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-blastzOutRoot /cluster/bluearc/gasAcu1ChrOryLat1Un \
	-bigClusterHub=kk > do.log 2>&1 &

#######################################################################
## BLASTZ Stickleback chroms vs Medaka chroms and random contigs
##	(DONE - 2006-12-22 - Hiram)
    ssh kkstore05
    mkdir /cluster/data/gasAcu1/bed/blastz.oryLat1.2006-12-22
    cd /cluster/data/gasAcu1/bed/blastz.oryLat1.2006-12-22
    cat << '_EOF_' > DEF
# Stickleback chroms vs. Medaka random contigs

# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
    
# TARGET: Stickleback gasAcu1 no chrUn sequence, only chroms
#       The largest is 32 million bases, there are 22 of them
SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.sizes
SEQ1_CHUNK=40000000
SEQ1_LAP=10000

# QUERY: Medaka oryLat1 chroms and chrUn in scaffolds
#       The largest scaffold is only 76,000 bases, there are 36,000 of them
#       and a total size of 119 million.
#       The biggest chrom is just under 40 million
SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift
SEQ2_CHUNK=10000000
SEQ2_LIMIT=20
SEQ2_LAP=0

BASE=/cluster/data/gasAcu1/bed/blastz.oryLat1.2006-12-22
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time doBlastzChainNet.pl -verbose=2 DEF \
	-chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-blastzOutRoot /cluster/bluearc/gasAcu1ChrOryLat1ChrUn \
	-bigClusterHub=pk > do.log 2>&1 &
    #	real    132m54.403s
    #	user    0m0.213s
    #	sys     0m0.138s

    ssh hgwdev
    cd /cluster/data/gasAcu1/bed/blastz.oryLat1.2006-12-22
    time featureBits gasAcu1 chainOryLat1Link \
	> fb.gasAcu1.chainOryLat1Link.txt 2>&1
    #	164155231 bases of 446627861 (36.754%) in intersection

#########################################################################
##  Reorder Fish organisms (DONE - 2006-12-22 - Hiram)
    hgsql -h genome-testdb hgcentraltest \
	-e "update dbDb set orderKey = 470 where name = 'gasAcu1';"

#########################################################################
## 7-Way Multiz (DONE - 2006-12-22 - Hiram)
##  replaced by 8-Way done below
    ssh hgwdev
    mkdir /cluster/data/gasAcu1/bed/multiz7way
    cd /cluster/data/gasAcu1/bed/multiz7way
    #	Constructed this tree by pruining the 17-way tree from mm8
    #	multiz17way, and then adding in gasAcu1 and oryLat1 branch
    #	Arbitrarily set 0.2 distances for this added branch
    #	All other distances remain as specified in the 17way.nh
    cat << '_EOF_' > 7way.nh
((human_hg18:0.054922,mouse_mm8:0.412790):0.754413,
(
    ((tetraodon_tetNig1:0.199381,fugu_fr1:0.239894):0.2,
    (stickleback_gasAcu1:0.2,medaka_oryLat1:0.2):0.2):0.292961,
zebrafish_danRer4:0.782561):0.156067);
'_EOF_'
    # << happy emacs

    /cluster/bin/phast/draw_tree 7way.nh > 7way.ps
    /cluster/bin/phast/all_dists 7way.nh > 7way.distances.txt
    #	Use this output to create the table below
    grep -y gasAcu1 7way.distances.txt | sort -k3,3n
#
#	If you can fill in all the numbers in this table, you are ready for
#	the multiple alignment procedure
#
#                         featureBits chainLink measures
#                                           chainGasAcu1Link  chain linearGap
#    distance                      on gasAcu1    on other   minScore
#  1  0.4000 - medaka oryLat1      (% 38.754) (% 27.044)       2000     loose
#  2  0.7994 - tetraodon tetNig1   (% 42.044) (% 50.004)       5000     loose
#  3  0.8399 - fugu fr1            (% 32.836) (% 49.909)       2000     loose
#  4  1.4755 - zebrafish danRer4   (% 35.163) (% 13.104)       5000     loose
#  6  1.6584 - human hg18          (% 11.353) (%  1.923)       2000     loose
#  7  2.0162 - mouse mm8           (% 11.070) (%  2.056)       2000

    cd /cluster/data/gasAcu1/bed/multiz7way
    #	bash shell syntax here ...
    export H=/cluster/data/gasAcu1/bed
    mkdir mafLinks
    for G in oryLat1 tetNig1 fr1 danRer4 hg18 mm8
    do
	mkdir mafLinks/$G
	if [ ! -d ${H}/blastz.${G}/mafNet ]; then
	echo "missing directory blastz.${G}/mafNet"
		exit 255
	fi
	ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G
    done

    #	Copy MAFs to some appropriate NFS server for kluster run
    ssh kkstore05
    mkdir /san/sanvol1/scratch/gasAcu1/multiz7way
    cd /san/sanvol1/scratch/gasAcu1/multiz7way
    time rsync -a --copy-links --progress \
	/cluster/data/gasAcu1/bed/multiz7way/mafLinks/ .

    mkdir penn
    cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
    cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
    cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn

    # the autoMultiz cluster run
    ssh pk
    cd /cluster/data/gasAcu1/bed/multiz7way/

    # create species list and stripped down tree for autoMZ
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
	7way.nh > tmp.nh
    echo `cat tmp.nh` > tree-commas.nh
    echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
    sed 's/[()]//g; s/,/ /g' tree.nh > species.lst

    mkdir run maf
    cd run

    #	NOTE: you need to set the db and multiz dirname properly in this script
    cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
set db = gasAcu1
set c = $1
set maf = $2
set binDir = /san/sanvol1/scratch/$db/multiz7way/penn
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/multiz7way
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
    set in = $pairs/$s/$c.maf
    set out = $db.$s.sing.maf
    if ($s == $db) then
	continue
    endif
    if (-e $in.gz) then
	zcat $in.gz > $out
    else if (-e $in) then
	cp $in $out
    else
	echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($binDir $path); rehash
$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod +x autoMultiz

cat  << '_EOF_' > template
#LOOP
autoMultiz $(root1) {check out line+ /cluster/data/gasAcu1/bed/multiz7way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << happy emacs

    awk '{print $1}' /cluster/data/gasAcu1/chrom.sizes > chrom.lst
    gensub2 chrom.lst single template jobList
    para create jobList
    # 23 jobs
    para try ... check ... push ... etc ...
# Completed: 23 of 23 jobs
# CPU time in finished jobs:       9086s     151.43m     2.52h    0.11d  0.000 y
# IO & Wait Time:                   194s       3.24m     0.05h    0.00d  0.000 y
# Average job time:                 403s       6.72m     0.11h    0.00d
# Longest finished job:             707s      11.78m     0.20h    0.01d
# Submission to last job:           707s      11.78m     0.20h    0.01d

    #	combine results into a single file for loading and gbdb reference
    ssh kkstore05
    cd /cluster/data/gasAcu1/bed/multiz7way
    time catDir maf > multiz7way.maf
    #	real    0m6.467s
    #	makes an 1.5 Gb file:
    #	-rw-rw-r--  1 1577409197 Dec 22 14:49 multiz7way.maf

    #	Create per-chrom individual maf files for downloads
    ssh kkstore05
    cd /cluster/data/gasAcu1/bed/multiz7way
    mkdir mafDownloads
    time for M in maf/chr*.maf
    do
	B=`basename $M`
	cp -p ${M} mafDownloads/${B}
	gzip mafDownloads/${B}
	echo ${B} done
    done
    #	real    4m58.923s
    #	user    4m41.178s
    #	sys     0m14.387s

    #	deliver to downloads
    ssh hgwdev
    ln -s /cluster/data/gasAcu1/bed/multiz7way/mafDownloads \
	/usr/local/apache/htdocs/goldenPath/gasAcu1/multiz7way

    # Load into database
    ssh hgwdev
    cd /cluster/data/gasAcu1/bed/multiz7way
    mkdir /gbdb/gasAcu1/multiz7way
    ln -s /cluster/data/gasAcu1/bed/multiz7way/multiz7way.maf \
	/gbdb/gasAcu1/multiz7way
    time nice -n +19 hgLoadMaf gasAcu1 multiz7way
    #	Loaded 1925835 mafs in 1 files from /gbdb/gasAcu1/multiz7way
    #	real    0m45.970s
    #	user    0m25.438s
    #	sys     0m8.453s

    time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
	-maxSize=50000 gasAcu1 multiz7waySummary multiz7way.maf
    #	Created 684283 summary blocks from 5182575 components
    #		and 1925835 mafs from multiz7way.maf
    #	real    0m54.289s
    #	user    0m48.004s
    #	sys     0m1.779s

    #	Create tree image for details page
    #	You can get a better image from the phyloGif tool:
    #	http://genome.ucsc.edu/cgi-bin/phyloGif
    cp 7way.nh treeImage.nh
    #	Edit treeImage.nh to make the names as desired
    #	Submit this treeImage.nh to the phyloGif program, or:
    /cluster/bin/phast/draw_tree -b -s treeImage.nh > treeImage.ps

############################################################################
# ANNOTATE MULTIZ7WAY MAF AND LOAD TABLES (DONE - 2006-12-22 - Hiram)
##  replaced by 8-Way done below
    ssh kolossus
    mkdir /cluster/data/gasAcu1/bed/multiz7way/anno
    cd /cluster/data/gasAcu1/bed/multiz7way/anno
    mkdir maf run
    cd run
    rm -f sizes nBeds
    ln -s  /cluster/data/gasAcu1/chrom.sizes gasAcu1.len
    twoBitInfo -nBed /cluster/data/gasAcu1/gasAcu1.sdTrf.{2bit,N.bed}
    ln -s  /cluster/data/gasAcu1/gasAcu1.sdTrf.N.bed gasAcu1.bed

    for DB in `grep -v gasAcu1 /cluster/data/gasAcu1/bed/multiz7way/species.lst`
    do
        ln -s  /cluster/data/${DB}/chrom.sizes ${DB}.len
        ln -s  /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
	echo ${DB}.bed  >> nBeds
	echo ${DB}.len  >> sizes
	echo $DB
    done

    echo '#!/bin/csh -ef' > jobs.csh
    echo date >> jobs.csh
    # do smaller jobs first so you can see some progress immediately:
    for F in `ls -1rS ../../maf/*.maf`
    do
     echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $F \
      /cluster/data/gasAcu1/gasAcu1.sdTrf.2bit ../maf/`basename $F` >> jobs.csh
     echo "echo $F" >> jobs.csh
    done 
    echo date >> jobs.csh
    chmod +x jobs.csh
    time ./jobs.csh > jobs.log 2>&1 &
    #	to watch progress;
    tail -f jobs.log
    #	real    19m20.705s
    #	user    7m11.818s
    #	sys     11m31.508s

    # Load anno/maf
    ssh hgwdev
    cd /cluster/data/gasAcu1/bed/multiz7way/anno/maf
    mkdir -p /gbdb/gasAcu1/multiz7way/anno/maf
    ln -s /cluster/data/gasAcu1/bed/multiz7way/anno/maf/*.maf \
      /gbdb/gasAcu1/multiz7way/anno/maf
    time nice -n +19 hgLoadMaf \
	-pathPrefix=/gbdb/gasAcu1/multiz7way/anno/maf gasAcu1 multiz7way
    #	Loaded 2269386 mafs in 23 files from /gbdb/gasAcu1/multiz7way/anno/maf
    #	real    0m52.053s
    #	user    0m35.608s
    #	sys     0m3.762s

    # Do the computation-intensive part of hgLoadMafSummary on a workhorse 
    # machine and then load on hgwdev:
    ssh kolossus
    cd /cluster/data/gasAcu1/bed/multiz7way/anno/maf
    time cat *.maf | \
    	nice -n +19 hgLoadMafSummary gasAcu1 -minSize=30000 -mergeGap=1500 \
	-maxSize=200000 -test multiz7waySummary stdin

#######################################################################
## BLASTZ Stickleback chroms vs chicken chroms and random contigs
##	(DONE - 2006-12-28 - 2007-01-05 - Hiram)
    ssh kkstore05
    mkdir /cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28
    cd /cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28
    cat << '_EOF_' > DEF
# Stickleback chroms vs. Chicken random contigs

# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
    
# TARGET: Stickleback gasAcu1 no chrUn sequence, only chroms
#	The largest is 32 million bases, there are 22 of them
SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.sizes
SEQ1_CHUNK=40000000
SEQ1_LAP=10000

# QUERY: Chicken galGal3 chroms and randoms in contigs
#	The largest chrom is 200M bases, the largest contig 122,228
#	The smallest chrom is 1028 bases, smallest contig 54 bases
#	there are 25,991 chroms and contig pieces
#	total size 1,089,690,673 bases
#	47107538 N's 1042583135 real 834274605 upper 208308530 lower
SEQ2_DIR=/san/sanvol1/scratch/galGal3/galGal3.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/galGal3/galGal3.sdTrf.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/galGal3/galGal3.randomContigs.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/galGal3/galGal3.randomContigs.sdTrf.sizes
SEQ2_LIFT=/san/sanvol1/scratch/galGal3/randomContigs.lft
SEQ2_CHUNK=10000000
SEQ2_LIMIT=20
SEQ2_LAP=0

BASE=/cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time doBlastzChainNet.pl -verbose=2 DEF \
	-smallClusterHub=pk \
	-chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-blastzOutRoot /cluster/bluearc/gasAcu1ChrGalGal3ChrUn \
	-bigClusterHub=pk > do.log 2>&1 &
    #	real    53m30.713s
    #	failed during the load step due to difficulties on hgwdev
    #	continuing 2007-01-02
    cd axtChain
    ./loadUp.csh
    cd ..
    time doBlastzChainNet.pl -verbose=2 DEF \
	-smallClusterHub=pk \
	-chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-blastzOutRoot /cluster/bluearc/gasAcu1ChrGalGal3ChrUn \
	-continue=download -bigClusterHub=pk > download.log 2>&1 &

    #	And, to swap:
    mkdir /cluster/data/galGal3/bed/blastz.gasAcu1.swap
    cd /cluster/data/galGal3/bed/blastz.gasAcu1.swap
    time doBlastzChainNet.pl -verbose=2 \
	/cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28/DEF \
	-smallClusterHub=pk \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -swap -bigClusterHub=pk > swap.log 2>&1 &
    #	failed on the net step:
# HgStepManager: executing step 'net'.
# netChains: looks like previous stage was not successful
#	(can't find [galGal3.gasAcu1.]all.chain[.gz]).
    time doBlastzChainNet.pl -verbose=2 \
	/cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28/DEF \
	-smallClusterHub=pk \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -continue=net -swap -bigClusterHub=pk > net_swap.log 2>&1 &
    #	real    7m11.286s

    featureBits gasAcu1 chainGalGal3Link > fb.gasAcu1.chainGalGal2Link.txt 
    #	37154459 bases of 446627861 (8.319%) in intersection
    featureBits galGal3 chainGasAcu1Link > fb.galGal3.chainGasAcu1Link.txt
    #	32781658 bases of 1042591351 (3.144%) in intersection

    #  Adding the chrUn result from the galGal3 swapped alignments
    ssh kkstore05
    cd /cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28/axtChain
    chainSplit chain gasAcu1.galGal3.all.chain.gz
    cp -p ../../blastz.galGal3.swap/axtChain/chain/chrUn.chain chain/chrUn.chain
    ssh hgwdev
    cd /cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28/axtChain/chain
    hgLoadChain gasAcu1 chrUn_chainGalGal3 chrUn.chain
    Loading 13904 chains into gasAcu1.chrUn_chainGalGal3

    ssh kkstore05
    cd /cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28/axtChain/chain
    chainMergeSort *.chain | gzip -c > ../gasAcu1.galGal3.all.chain.gz
    #	This changes the IDs, but nothing else
    #	reusing the existing netChains with some alterations
    cp netChains.csh netChainsUn.csh
    #	fixup the reference to chrom.sizes and the 2bit file in this script
    #	and running it after moving existing items out of the way


#########################################################################
## 8-Way Multiz (DONE - 2007-01-02 - 2007-01-05 - Hiram)
##  re-done with new chrUn.maf 2007-01-13 - Hiram
##  re-done with fr2 instead of fr1 - 2007-02-02 - Hiram
    ssh hgwdev
    mkdir /cluster/data/gasAcu1/bed/multiz8way
    cd /cluster/data/gasAcu1/bed/multiz8way

    /cluster/bin/phast/tree_doctor --prune chimp_panTro2,macaque_rheMac2,rat_rn4,rabbit_oryCun1,cow_bosTau2,dog_canFam2,armadillo_dasNov1,elephant_loxAfr1,tenrec_echTel1,monodelphis_monDom4,xenopus_xenTro2 17way.nh

    #	use the output of that to manually construct this tree.
    #	Arbitrarily set 0.2 distances for this added branch
    #	All other distances remain as specified in the 17way.nh

    cat << '_EOF_' > 8way.nh
(((human_hg18:0.054922,mouse_mm8:0.412790):0.475049,
	chicken_galGal3:0.454691):0.279364,
  (((tetraodon_tetNig1:0.199381,fugu_fr2:0.239894):0.2,
    (stickleback_gasAcu1:0.2,medaka_oryLat1:0.2):0.2):0.292961,
	zebrafish_danRer4:0.782561):0.156067);
'_EOF_'
    # << happy emacs

    #	Use this specification in the phyloGif tool:
    #	http://genome.ucsc.edu/cgi-bin/phyloGif
    #	to obtain a gif image for htdocs/images/phylo/gasAcu1_8way.gif

    /cluster/bin/phast/all_dists 8way.nh > 8way.distances.txt
    #	Use this output to create the table below
    grep -y gasAcu1 8way.distances.txt | sort -k3,3n
#
#	If you can fill in all the numbers in this table, you are ready for
#	the multiple alignment procedure
#
#                         featureBits chainLink measures
#                                           chainGasAcu1Link  chain linearGap
#    distance                      on gasAcu1    on other   minScore
#  1  0.4000 - medaka oryLat1      (% 38.308) (% 27.044)       2000     loose
#  2  0.7994 - tetraodon tetNig1   (% 42.044) (% 50.004)       5000     loose
#  3  0.8399 - fugu fr2            (% 37.574) (% 40.269)       2000     loose
#  3  0.8399 - fugu fr1            (% 35.925) (% 46.909)       2000     loose
#  4  1.4755 - zebrafish danRer4   (% 35.163) (% 13.104)       5000     loose
#  6  1.5831 - chicken galGal3     (%  9.181) (%  3.144)       2000     loose
#  7  1.6584 - human hg18          (% 11.353) (%  1.923)       2000     loose
#  8  2.0162 - mouse mm8           (% 11.070) (%  2.056)       2000     loose

    cd /cluster/data/gasAcu1/bed/multiz8way
    #	bash shell syntax here ...
    export H=/cluster/data/gasAcu1/bed
    mkdir mafLinks
    for G in oryLat1 tetNig1 fr2 danRer4 galGal3 hg18 mm8
    do
	mkdir mafLinks/$G
	if [ ! -d ${H}/blastz.${G}/mafNet ]; then
	echo "missing directory blastz.${G}/mafNet"
		exit 255
	fi
	ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G
    done

    #	Copy MAFs to some appropriate NFS server for kluster run
    ssh kkstore05
    mkdir /san/sanvol1/scratch/gasAcu1/multiz8way
    cd /san/sanvol1/scratch/gasAcu1/multiz8way
    time rsync -a --copy-links --progress \
	/cluster/data/gasAcu1/bed/multiz8way/mafLinks/ .

    mkdir penn
    cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
    cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
    cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn

    # the autoMultiz cluster run
    ssh pk
    cd /cluster/data/gasAcu1/bed/multiz8way/

    # create species list and stripped down tree for autoMZ
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
	8way.nh > tmp.nh
    echo `cat tmp.nh` > tree-commas.nh
    echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
    sed 's/[()]//g; s/,/ /g' tree.nh > species.lst

    mkdir run maf
    cd run

    #	NOTE: you need to set the db and multiz dirname properly in this script
    cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
set db = gasAcu1
set c = $1
set maf = $2
set binDir = /san/sanvol1/scratch/$db/multiz8way/penn
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/multiz8way
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
    set in = $pairs/$s/$c.maf
    set out = $db.$s.sing.maf
    if ($s == $db) then
	continue
    endif
    if (-e $in.gz) then
	zcat $in.gz > $out
    else if (-e $in) then
	cp $in $out
    else
	echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($binDir $path); rehash
$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod +x autoMultiz

cat  << '_EOF_' > template
#LOOP
autoMultiz $(root1) {check out line+ /cluster/data/gasAcu1/bed/multiz8way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << happy emacs

    awk '{print $1}' /cluster/data/gasAcu1/chrom.sizes > chrom.lst
    gensub2 chrom.lst single template jobList
    para create jobList
    # 23 jobs
    para try ... check ... push ... etc ...
# Completed: 23 of 23 jobs
# CPU time in finished jobs:      11075s     184.58m     3.08h    0.13d  0.000 y
# IO & Wait Time:                   155s       2.58m     0.04h    0.00d  0.000 y
# Average job time:                 488s       8.14m     0.14h    0.01d
# Longest finished job:            1123s      18.72m     0.31h    0.01d
# Submission to last job:          1123s      18.72m     0.31h    0.01d

    #	combine results into a single file for loading and gbdb reference
    ssh kkstore05
    cd /cluster/data/gasAcu1/bed/multiz8way
    time nice -n +19 catDir maf > multiz8way.maf
    #	real    0m7.656s
    #	makes an 1.7 Gb file:
    #	-rw-rw-r--  1 1742307109 Feb  2 16:58 multiz8way.maf

    #	Create per-chrom individual maf files for downloads
    #	NOT NECESSARY HERE - DONE LATER WITH THE ANNOTATED MAFS
    ssh kkstore05
    cd /cluster/data/gasAcu1/bed/multiz8way
    mkdir mafDownloads
    time for M in maf/chr*.maf
    do
	B=`basename $M`
	cp -p ${M} mafDownloads/${B}
	gzip mafDownloads/${B}
	echo ${B} done
    done
    #	real    5m9.273

    #	deliver to downloads *!* NOT NECESSARY HERE - DONE LATER WITH
    #		THE ANNOTATED MAFS
    ssh hgwdev
    ln -s /cluster/data/gasAcu1/bed/multiz8way/mafDownloads \
	/usr/local/apache/htdocs/goldenPath/gasAcu1/multiz8way

    # Load into database
    ssh hgwdev
    cd /cluster/data/gasAcu1/bed/multiz8way
    mkdir /gbdb/gasAcu1/multiz8way
    ln -s /cluster/data/gasAcu1/bed/multiz8way/multiz8way.maf \
	/gbdb/gasAcu1/multiz8way
    time nice -n +19 hgLoadMaf gasAcu1 multiz8way
    #	Loaded 1989046 mafs in 1 files from /gbdb/gasAcu1/multiz8way
    #	real    0m47.121s

    time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
	-maxSize=50000 gasAcu1 multiz8waySummary multiz8way.maf
    #	Created 775687 summary blocks from 6070904 components
    #		and 2047322 mafs from multiz8way.maf
    #	real    1m0.331s

    #	Create tree image for details page
    #	You can get a better image from the phyloGif tool:
    #	http://genome.ucsc.edu/cgi-bin/phyloGif
    # cp 8way.nh treeImage.nh
    #	Edit treeImage.nh to make the names as desired
    #	Submit this treeImage.nh to the phyloGif program, or:
    # /cluster/bin/phast/draw_tree -b -s treeImage.nh > treeImage.ps

############################################################################
# ANNOTATE MULTIZ8WAY MAF AND LOAD TABLES (DONE - 2007-01-05 - Hiram)
##  re-done with new chrUn.maf 2007-01-13 - Hiram
##  re-done with fr2 instead of fr1 - 2007-02-02 - Hiram
##  re-done with correct nBeds and sizes files, 2007-03-27 - Hiram
    ssh kolossus
    mkdir /cluster/data/gasAcu1/bed/multiz8way/anno
    cd /cluster/data/gasAcu1/bed/multiz8way/anno
    mkdir maf run
    cd run
    rm -f sizes nBeds
    ln -s  /cluster/data/gasAcu1/chrom.sizes gasAcu1.len
    #	This were done before during the 7-way
    # twoBitInfo -nBed /cluster/data/gasAcu1/gasAcu1.sdTrf.{2bit,N.bed}
    ln -s  /cluster/data/gasAcu1/gasAcu1.sdTrf.N.bed gasAcu1.bed
    for DB in \
`cat /cluster/data/gasAcu1/bed/multiz8way/species.lst | sed -e "s/ gasAcu1//"`
    do
        ln -s  /cluster/data/${DB}/chrom.sizes ${DB}.len
        ln -s  /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
	echo ${DB}.bed  >> nBeds
	echo ${DB}.len  >> sizes
	echo $DB
    done

    echo '#!/bin/csh -ef' > jobs.csh
    echo date >> jobs.csh
    # do smaller jobs first so you can see some progress immediately:
    for F in `ls -1rS ../../maf/*.maf`
    do
     echo mafAddIRows -nBeds=nBeds -sizes=sizes $F \
      /cluster/data/gasAcu1/gasAcu1.sdTrf.2bit ../maf/`basename $F` >> jobs.csh
     echo "echo $F" >> jobs.csh
    done 
    echo date >> jobs.csh
    chmod +x jobs.csh
    time nice -n +19 ./jobs.csh > jobs.log 2>&1 &
    #	to watch progress;
    tail -f jobs.log
    #	this was run on titan (can not do this for larger genomes on titan)
    #	real    48m40.965s
    # Load anno/maf
    ssh hgwdev
    cd /cluster/data/gasAcu1/bed/multiz8way/anno/maf
    mkdir -p /gbdb/gasAcu1/multiz8way/anno/maf
    ln -s /cluster/data/gasAcu1/bed/multiz8way/anno/maf/*.maf \
      /gbdb/gasAcu1/multiz8way/anno/maf
    time nice -n +19 hgLoadMaf \
	-pathPrefix=/gbdb/gasAcu1/multiz8way/anno/maf gasAcu1 multiz8way
    #	Loaded 2395584 mafs in 23 files from /gbdb/gasAcu1/multiz8way/anno/maf
    #	real    2m16.918s

    # Do the computation-intensive part of hgLoadMafSummary on a workhorse 
    # machine and then load on hgwdev:
    ssh kolossus
    cd /cluster/data/gasAcu1/bed/multiz8way/anno/maf
    time cat *.maf | \
    	nice -n +19 hgLoadMafSummary gasAcu1 -minSize=30000 -mergeGap=1500 \
	-maxSize=200000 -test multiz8waySummary stdin
    #	Created 322344 summary blocks from 6111763 components
    #	and 2395584 mafs from stdin
    #	real    23m49.073s
    #	-rw-rw-r--  1  15063781 Mar 27 12:02 multiz8waySummary.tab

    ssh hgwdev
    cd /cluster/data/gasAcu1/bed/multiz8way/anno/maf
    sed -e 's/mafSummary/multiz8waySummary/' ~/kent/src/hg/lib/mafSummary.sql \
      > /tmp/multiz8waySummary.sql
    time nice -n +19 hgLoadSqlTab gasAcu1 multiz8waySummary \
	~/kent/src/hg/lib/mafSummary.sql multiz8waySummary.tab
    #	real    0m4.525

#########################################################################
# Adding automatic generation of upstream files (DONE - 2009-08-13 - Hiram)
    # edit src/hg/makeDb/genbank/genbank.conf to add:
gasAcu1.upstreamGeneTbl = ensGene
gasAcu1.upstreamMaf = multiz8way /hive/data/genomes/gasAcu1/bed/multiz8way/species.lst

#########################################################################
# MULTIZ8WAY DOWNLOADABLES (DONE - 2007-01-05 - Hiram)
##  re-done with new chrUn.maf 2007-01-13 - Hiram
##  re-done with fr2 in place of fr1 - 2007-02-03 - Hiram
    # Annotated MAF is now documented, so use anno/maf for downloads/
    ssh hgwdev
    mkdir /cluster/data/gasAcu1/bed/multiz8way/mafDownloads
    cd /cluster/data/gasAcu1/bed/multiz8way

    # upstream mafs 
    #	rebuilt 2007-12-21 to fix difficulty in mafFrags when species.lst
    #	did not have mm8 as the first one
    #	There isn't any refGene table, using ensGene instead
for S in 1000 2000 5000
do
    echo "making upstream${S}.maf"
    nice -n +19 $HOME/bin/$MACHTYPE/featureBits -verbose=2 gasAcu1 \
	ensGene:upstream:${S} -fa=/dev/null -bed=stdout \
    	| perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
	| $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags gasAcu1 multiz8way \
		stdin stdout -orgs=species.lst \
	| gzip -c > mafDownloads/ensGene.upstream${S}.maf.gz
    echo "done ensGene.upstream${S}.maf.gz"
done

    ## re-done after correction to anno/maf files 2007-03-27 - Hiram
    ssh kkstore05
    cd /cluster/data/gasAcu1/bed/multiz8way
    time for M in anno/maf/chr*.maf
    do
	B=`basename $M`
	nice -n +19 gzip -c ${M} > mafDownloads/${B}.gz
	echo ${B}.gz done
    done
    #	real    6m1.454

    cd mafDownloads
    nice -n +19 md5sum *.maf.gz > md5sum.txt
    # Make a README.txt

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/gasAcu1/multiz8way
    cd /usr/local/apache/htdocs/goldenPath/gasAcu1/multiz8way
    ln -s /cluster/data/gasAcu1/bed/multiz8way/mafDownloads/{*.gz,*.txt} .

#######################################################################
# MULTIZ8WAY MAF FRAMES (DONE - 2007-01-10 - Hiram)
##  re-done with new chrUn.maf 2007-01-13 - Hiram
##  re-donw with fr2 in place of fr1 - 2007-02-03 - Hiram
    ssh hgwdev
    mkdir /cluster/data/gasAcu1/bed/multiz8way/frames
    cd /cluster/data/gasAcu1/bed/multiz8way/frames
    mkdir genes
    # The following is adapted from the mm8 sequence

    #------------------------------------------------------------------------
    # get the genes for all genomes
    # mRNAs with CDS.  single select to get cds+psl, then split that up and
    # create genePred
    # using ensGene for gasAcu1 oryLat1
    for qDB in danRer4 galGal3
    do
	geneTbl=refGene
      echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB}
      hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 2-100 \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/$qDB.tmp.gz
      mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
      rm -f $tmpExt
    done

    # using knownGene for mm8 hg18
    # using genscan for tetNig1
    # using ensGene for gasAcu1, oryLat1 and fr2
    # genePreds; (must keep only the first 10 columns for knownGene)
    for qDB in mm8 hg18 gasAcu1 oryLat1 fr2 tetNig1
    do
      if [ $qDB = "gasAcu1" -o $qDB = "oryLat1" -o $qDB = "fr2" ]; then
        geneTbl=ensGene
      elif [ $qDB = "tetNig1" ]; then
        geneTbl=genscan
      else
        geneTbl=knownGene
      fi
      echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB}
      hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 1-10 \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/$qDB.tmp.gz
      mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
      rm -f $tmpExt
    done

    ###
    
    ssh kkstore05
    cd /cluster/data/gasAcu1/bed/multiz8way/frames
    time cat ../maf/*.maf | nice -n +19 genePredToMafFrames gasAcu1 stdin stdout gasAcu1 genes/gasAcu1.gp.gz oryLat1 genes/oryLat1.gp.gz fr2 genes/fr2.gp.gz tetNig1 genes/tetNig1.gp.gz danRer4 genes/danRer4.gp.gz galGal3 genes/galGal3.gp.gz mm8 genes/mm8.gp.gz hg18 genes/hg18.gp.gz | gzip > multiz8way.mafFrames.gz
    #	real    0m55.706

    ssh hgwdev
    cd /cluster/data/gasAcu1/bed/multiz8way/frames
    time nice -n +19 hgLoadMafFrames gasAcu1 multiz8wayFrames \
	multiz8way.mafFrames.gz
    #	real    0m32.220s

############################################################################
# CREATE CONSERVATION WIGGLE WITH PHASTCONS
#		(DONE - 2007-01-10 - 2007-01-17 - Hiram)
## re-done with fr2 in place of fr1 - 2007-02-03 - Hiram

# Estimate phastCons parameters
    ssh kkstore05
    mkdir /cluster/data/gasAcu1/bed/multiz8way/cons
    cd /cluster/data/gasAcu1/bed/multiz8way/cons

    twoBitToFa -seq=chrIV ../../../gasAcu1.sdTrf.2bit chrIV.fa
    # Create a starting-tree.mod based on chrIV (the largest one)
    time nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chrIV.maf \
	--refseq ./chrIV.fa --in-format MAF \
	--windows 100000000,1000 --out-format SS \
	--between-blocks 5000 --out-root s1
    #	26 seconds

    time nice -n +19 /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \
--tree "(((hg18,mm8),galGal3),(((tetNig1,fr2),(gasAcu1,oryLat1)),danRer4))" \
    --out-root starting-tree
    #	Fitting tree model to s1.1-32632948.ss using REV ...
    #	Writing model to starting-tree.mod ...
    #	real    3m53.084

    rm s1.*.ss
    # add up the C and G:
    grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
    #	0.449
    #	This 0.449 is used in the --gc argument below

    ## the fa files are needed for the sequence and they are created during
    #	this loop if they haven't been done before
    # Create SS files on san filesystem
    #	Using a size of 10,000,000 to slow down the phastCons pk jobs
    ssh kkstore05
    mkdir -p  /san/sanvol1/scratch/gasAcu1/cons/ss
    cd  /san/sanvol1/scratch/gasAcu1/cons/ss
    time for C in `awk '{print $1}' /cluster/data/gasAcu1/chrom.sizes`
    do
      if [ -s /cluster/data/gasAcu1/bed/multiz8way/maf/${C}.maf ]; then
	mkdir -p ${C}
	mkdir -p /cluster/data/gasAcu1/bed/multiz8way/cons/${C}
	echo msa_split $C
	if [ ! -s /cluster/data/gasAcu1/bed/multiz8way/cons/${C}/${C}.fa ]; then
	    twoBitToFa -seq=${C} /cluster/data/gasAcu1/gasAcu1.sdTrf.2bit \
		/cluster/data/gasAcu1/bed/multiz8way/cons/${C}/${C}.fa
	fi
	chrN=${C/chr/}
	chrN=${chrN/_random/}
	nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split \
	    /cluster/data/gasAcu1/bed/multiz8way/maf/${C}.maf \
	    --refseq /cluster/data/gasAcu1/bed/multiz8way/cons/${C}/${C}.fa \
	    --in-format MAF --windows 10000000,0 --between-blocks 5000 \
	    --out-format SS --out-root ${C}/${C}
      fi
    done &
    #	real    6m31.721

    # Create a random list of 50 1 mb regions  (do not use the _randoms)
    cd /san/sanvol1/scratch/gasAcu1/cons/ss
    ls -1l chr*/chr*.ss | grep -v Un | \
	awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list

    # Set up parasol directory to calculate trees on these 50 regions
    ssh pk
    mkdir /san/sanvol1/scratch/gasAcu1/cons/treeRun6
    cd /san/sanvol1/scratch/gasAcu1/cons/treeRun6
    mkdir tree log

    #	Tuning this loop should come back to here to recalculate 
    # Create little script that calls phastCons with right arguments
    #	--target-coverage of 0.20 is about right for mouse, will be
    #	tuned exactly below
    cat > makeTree.csh << '_EOF_'
#!/bin/csh -fe
set C=$1:h
mkdir -p log/${C} tree/${C}
    /cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \
      /cluster/data/gasAcu1/bed/multiz8way/cons/starting-tree.mod \
      --gc 0.449 --nrates 1,1 --no-post-probs --ignore-missing \
      --expected-length 10 --target-coverage 0.20 \
      --quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
    #	<< happy emacs
    chmod a+x makeTree.csh

    # Create gensub file
    cat > template << '_EOF_'
#LOOP
./makeTree.csh $(path1)
#ENDLOOP
'_EOF_'
    #	<< happy emacs

    # Make cluster job and run it
    gensub2 ../randomSs.list single template jobList
    para create jobList
    para try/push/check/etc
# Completed: 47 of 47 jobs
# CPU time in finished jobs:      78013s    1300.22m    21.67h    0.90d  0.002 y
# IO & Wait Time:                   224s       3.73m     0.06h    0.00d  0.000 y
# Average job time:                1665s      27.74m     0.46h    0.02d
# Longest finished job:            2123s      35.38m     0.59h    0.02d
# Submission to last job:          2123s      35.38m     0.59h    0.02d

    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models
    ssh pk
    cd /san/sanvol1/scratch/gasAcu1/cons/treeRun6
    ls -1 tree/chr*/*.cons.mod > cons.list
    /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \
	--output-average ave.cons.mod > cons_summary.txt 2>&1 &
    ls -1 tree/chr*/*.noncons.mod > noncons.list
    /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \
	--output-average ave.noncons.mod > noncons_summary.txt
    cp -p ave.*.mod ..
    cd ..
    cp -p ave.*.mod /cluster/data/gasAcu1/bed/multiz8way/cons

    #	measuring entropy
    #	consEntopy <target coverage> <expected lengths>
    #		 ave.cons.mod ave.noncons.mod --NH 9.78
    #	never stops with the --NH argument
    time /cluster/bin/phast/$MACHTYPE/consEntropy --NH 9.7834 \
	0.20 10 ave.{cons,noncons}.mod
## 0.20 10 with fr2
( Solving for new omega: 10.000000 12.945684 12.467518 12.456325 )

Transition parameters: gamma=0.200000, omega=10.000000, mu=0.100000, nu=0.025000
Relative entropy: H=1.364850 bits/site
Expected min. length: L_min=6.767655 sites
Expected max. length: L_max=4.721873 sites
Phylogenetic information threshold: PIT=L_min*H=9.236837 bits
Recommended expected length: omega=12.456325 sites (for L_min*H=9.783400)

## 0.20 10 
( Solving for new omega: 10.000000 12.922391 12.450309 12.439360 )

Transition parameters: gamma=0.200000, omega=10.000000, mu=0.100000, nu=0.025000
Relative entropy: H=1.358227 bits/site
Expected min. length: L_min=6.803722 sites
Expected max. length: L_max=4.741215 sites
Phylogenetic information threshold: PIT=L_min*H=9.241002 bits
Recommended expected length: omega=12.439360 sites (for L_min*H=9.783400)

## 0.01 12 does not convert, trying without --NH
Transition parameters: gamma=0.010000, omega=12.000000, mu=0.083333, nu=0.000842
Relative entropy: H=1.012920 bits/site
Expected min. length: L_min=15.386530 sites
Expected max. length: L_max=10.103812 sites
Phylogenetic information threshold: PIT=L_min*H=15.585327 bits

## 0.10 12
# ( Solving for new omega: 12.000000 7.303911 5.603783 4.937054 4.700330 4.651391 4.648962 4.648956 )

# Transition parameters: gamma=0.100000, omega=12.000000, mu=0.083333, nu=0.009259
# Relative entropy: H=1.200128 bits/site
# Expected min. length: L_min=9.375672 sites
# Expected max. length: L_max=6.463613 sites
# Phylogenetic information threshold: PIT=L_min*H=11.252009 bits
# Recommended expected length: omega=4.648956 sites (for L_min*H=9.783400)

## 0.17 12
# ( Solving for new omega: 12.000000 10.568744 10.445646 10.444628 )

# Trans parameters: gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068
# Relative entropy: H=1.267443 bits/site
# Expected min. length: L_min=7.976979 sites
# Expected max. length: L_max=5.664374 sites
# Phylogenetic information threshold: PIT=L_min*H=10.110368 bits
# Recommended expected length: omega=10.444628 sites (for L_min*H=9.783400)

    /cluster/bin/phast/$MACHTYPE/consEntropy .17 12 \
                        ave.cons.mod ave.noncons.mod
Transition parameters: gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068
Relative entropy: H=1.253111 bits/site
Expected min. length: L_min=8.076184 sites
Expected max. length: L_max=5.736142 sites
Phylogenetic information threshold: PIT=L_min*H=10.120357 bits

#Transition parameters:gamma=0.100000, omega=12.000000, mu=0.083333, nu=0.009259
# Relative entropy: H=1.454874 bits/site
# Required length: N=7.596943 sites
# Total entropy: NH=11.052595 bits

    #	SKIP to here passing by the tuning numbers
    ssh pk
    # Create cluster dir to do main phastCons run
    mkdir /san/sanvol1/scratch/gasAcu1/cons/consRun5
    cd /san/sanvol1/scratch/gasAcu1/cons/consRun5
    mkdir ppRaw bed

    # Create script to run phastCons with right parameters
    #	This job is I/O intensive in its output files, thus it is all
    #	working over in /scratch/tmp/
    cat > doPhast << '_EOF_'
#!/bin/csh -fe
mkdir /scratch/tmp/${2}
cp -p ../ss/${1}/${2}.ss ../ave.{cons,noncons}.mod /scratch/tmp/${2}
pushd /scratch/tmp/${2} > /dev/null
/cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \
   --expected-length 10 --target-coverage 0.20 --quiet \
	--seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp
popd > /dev/null
mkdir -p ppRaw/${1}
mkdir -p bed/${1}
mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
mv /scratch/tmp/${2}/${2}.bed bed/${1}
rm /scratch/tmp/${2}/ave.{cons,noncons}.mod
rm /scratch/tmp/${2}/${2}.ss
rmdir /scratch/tmp/${2}
'_EOF_'
    # << happy emacs
    chmod a+x doPhast

    #	root1 == chrom name, file1 == ss file name without .ss suffix
    # Create gsub file
    cat > template << '_EOF_'
#LOOP
./doPhast $(root1) $(file1)
#ENDLOOP
'_EOF_'
    #	<< happy emacs

    # Create parasol batch and run it
    ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list

    gensub2 in.list single template jobList
    para create jobList
    para try/check/push/etc.
    #	These jobs are very fast and very I/O intensive, even on the san
    #	they will hang it up as they work at full tilt.
# Completed: 58 of 58 jobs
# CPU time in finished jobs:       1739s      28.99m     0.48h    0.02d  0.000 y
# IO & Wait Time:                   302s       5.03m     0.08h    0.00d  0.000 y
# Average job time:                  35s       0.59m     0.01h    0.00d
# Longest finished job:              47s       0.78m     0.01h    0.00d
# Submission to last job:            77s       1.28m     0.02h    0.00d

    # combine predictions and transform scores to be in 0-1000 interval
    #	it uses a lot of memory, so on kolossus:
    ssh kolossus
    cd /san/sanvol1/scratch/gasAcu1/cons/consRun5
    #	The sed's and the sort get the file names in chrom,start order
    #	You might like to verify it is correct by first looking at the
    #	list it produces:
    find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
	| sort -k7,7 -k9,9n \
	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | less
    #	if that looks right, then let it run:
    find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
	| sort -k7,7 -k9,9n \
	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
	| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
	| /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved8Way.bed
    #	~ 16 seconds
    cp -p mostConserved8Way.bed /cluster/data/gasAcu1/bed/multiz8way

    # Figure out how much is actually covered by the bed file as so:
    #	Get the non-n genome size from faSize on all chroms:
    ssh kkstore05
    cd /cluster/data/gasAcu1/bed/multiz8way/cons
    faSize chr*/*.fa
    #	463354448 bases (16726587 N's 446627861 real 348847599
    #	upper 97780262 lower) in 23 sequences in 23 files

    cd /san/sanvol1/scratch/gasAcu1/cons/consRun5
    #	The 446627861 comes from the non-n genome as counted above.
    awk '
{sum+=$3-$2}
END{printf "%% %.2f = 100.0*%d/446627861\n",100.0*sum/446627861,sum}' \
	mostConserved8Way.bed
    #	% 11.58 = 100.0*51703035/446627861 --exp-len 10 --tar-cov 0.20 fr2
    #	% 11.49 = 100.0*51326297/446627861 --exp-len 10 --tar-cov 0.20
    #	%  8.36 = 100.0*37347218/446627861 --exp-len 12 --tar-cov 0.01
    #	% 10.24 = 100.0*45733412/446627861 --exp-len 12 --tar-cov 0.10
    #	% 11.45 = 100.0*51136218/446627861 --exp-len 12 --tar-cov 0.17

    #	Aiming for %70 coverage in
    #	the following featureBits measurement on CDS:
    # Beware of negative scores when too high.  The logToBedScore
    # will output an error on any negative scores.

    HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits gasAcu1 \
	-enrichment ensGene:cds mostConserved8Way.bed
    #	--expected-length 10 --target-coverage 0.20 fr2
    #	ensGene:cds 6.618%, mostConserved8Way.bed 11.576%, both 4.070%, cover
    #	61.49%, enrich 5.31x
    #	--expected-length 10 --target-coverage 0.20
    #	ensGene:cds 6.618%, mostConserved8Way.bed 11.492%, both 4.041%, cover
    #	61.07%, enrich 5.31x
    #	--expected-length 12 --target-coverage 0.01
    #	ensGene:cds 6.618%, mostConserved8Way.bed 8.362%, both 3.628%, cover
    #	54.82%, enrich 6.56x
    #	--expected-length 12 --target-coverage 0.10
    #	ensGene:cds 6.618%, mostConserved8Way.bed 10.240%, both 3.920%, cover
    #	59.24%, enrich 5.79x
    #	--expected-length 12 --target-coverage 0.17
    #	ensGene:cds 6.618%, mostConserved8Way.bed 11.449%, both 4.092%, cover
    #	61.84%, enrich 5.40x

    # Load most conserved track into database
    ssh hgwdev
    cd /cluster/data/gasAcu1/bed/multiz8way
    # ended up using the set: --expected-length 10 --target-coverage 0.20
    time nice -n +19 hgLoadBed -strict gasAcu1 phastConsElements8way \
	mostConserved8Way.bed
    #	Loaded 1263039 elements of size 5
    #	real    0m29.697

    #	should measure the same as above
    time nice -n +19 \
	featureBits gasAcu1 -enrichment ensGene:cds phastConsElements8way
    #	At: --expected-length 10 --target-coverage 0.20 fr2
    #	ensGene:cds 6.618%, phastConsElements8way 11.576%, both 4.070%, cover
    #	61.49%, enrich 5.31x
    #	At: --expected-length 10 --target-coverage 0.20
    #	ensGene:cds 6.618%, phastConsElements8way 11.492%, both 4.041%, cover
    #	61.07%, enrich 5.31x
    #	At: --expected-length 12 --target-coverage 0.10 result:
    #	ensGene:cds 6.618%, phastConsElements8way 10.240%, both 3.920%, cover
    #	59.24%, enrich 5.79x
    #	At: --expected-length 12 --target-coverage 0.17 result:
    #	ensGene:cds 6.618%, phastConsElements8way 11.449%,
    #	both 4.092%, cover 61.84%, enrich 5.40x

    # Create merged posterier probability file and wiggle track data files
    ssh pk
    cd /san/sanvol1/scratch/gasAcu1/cons/consRun5
    # the sed business gets the names sorted by chromName, chromStart
    #	so that everything goes in numerical order into wigEncode
    #	This was verified above to be correct
    time nice -n +19 find ./ppRaw -type f \
	| sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
	| sort -k7,7 -k9,9n \
	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
	    | $HOME/bin/$MACHTYPE/wigEncode -noOverlap stdin \
		phastCons8.wig phastCons8.wib
    #	Converted stdin, upper limit 1.00, lower limit 0.00
    #	real    3m55.780
    #	-rw-rw-r--   1 297832681 Feb  3 14:50 phastCons8.wib
    #	-rw-rw-r--   1  49168315 Feb  3 14:50 phastCons8.wig
    time nice -n +19 cp -p phastCons8.wi? /cluster/data/gasAcu1/bed/multiz8way/

    #	prepare compressed copy of ascii data values for downloads
    ssh pk
    cd /san/sanvol1/scratch/gasAcu1/cons/consRun5
    cat << '_EOF_' > gzipAscii.sh
#!/bin/sh

TOP=`pwd`
export TOP

mkdir -p phastCons8Scores

for D in ppRaw/chr*
do
    C=${D/ppRaw\/}
    out=phastCons8Scores/${C}.data.gz
    echo "========================== ${C} ${D}"
    find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
	| sort -k7,7 -k9,9n \
	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat |
	    gzip > ${out}
done
'_EOF_'
    #	<< happy emacs
    chmod +x gzipAscii.sh
    time nice -n +19 ./gzipAscii.sh
    #	real    5m31.855s

    #	copy them for downloads
    ssh kkstore05
    #	this directory is actually a symlink from store9 to store8 to
    #	avoid the data full problem on store9
    mkdir /cluster/data/gasAcu1/bed/multiz8way/phastCons8Scores
    cd /cluster/data/gasAcu1/bed/multiz8way/phastCons8Scores
    cp -p  /san/sanvol1/scratch/gasAcu1/cons/consRun5/phastCons8Scores/* .
    # make a README.txt file here, and an md5sum

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/gasAcu1/phastCons8Scores
    cd /usr/local/apache/htdocs/goldenPath/gasAcu1/phastCons8Scores
    ln -s /cluster/data/gasAcu1/bed/multiz8way/phastCons8Scores/* .

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /cluster/data/gasAcu1/bed/multiz8way
    ln -s `pwd`/phastCons8.wib /gbdb/gasAcu1/wib/phastCons8.wib
    # ended up using the set: --expected-length 10 --target-coverage 0.20
    time nice -n +19 hgLoadWiggle gasAcu1 phastCons8 phastCons8.wig
    #	real    0m9.256s

    #  Create histogram to get an overview of all the data
    ssh hgwdev
    cd /cluster/data/gasAcu1/bed/multiz8way
    time nice -n +19 hgWiggle -doHistogram \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    -db=gasAcu1 phastCons8 > histogram.data 2>&1
    #	real    0m30.744

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
        xffffff x000000 x000000 x444444 xaa4400 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Stickleback gasAcu1 Histogram phastCons8 track"
set xlabel " phastCons8 score - --expected-length 10 --target-coverage 0.20"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

    #	The mostConserved can also be characterized by a histogram
    awk '{print $3-$2}' mostConserved8Way.bed > mostCons.txt
    textHistogram -verbose=2 -autoScale=1000 -pValues mostCons.txt \
	> mostCons.histo.txt

    cat << '_EOF_' | gnuplot > mostCons.png
set terminal png small color \
        xffffff x000000 x000000 x444444 xaa4400 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Stickleback gasAcu1 histogram: lengths of mostConserved track elements"
set xlabel " mostConserved element length - --expected-length 10 --target-coverage 0.20"
set ylabel " # of elements at this length"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set xrange [0:200]
set y2tics
set boxwidth 2
set style fill solid

plot "mostCons.histo.txt" using 2:3 title " # of elements" with boxes, \
        "mostCons.histo.txt" using 2:6 axes x1y2 title " CRF" with lines
'_EOF_'
#	<< happy emacs

##############################################################################
## Medaka oryLat1 swap (DONE - 2007-01-12 - Hiram)
    ## And the swap
    ssh kkstore05
    mkdir /cluster/data/gasAcu1/bed/blastz.orylat1.swap
    cd /cluster/data/gasAcu1/bed/blastz.orylat1.swap
    time doBlastzChainNet.pl -verbose=2 -smallClusterHub=pk \
	/cluster/data/oryLat1/bed/blastz.gasAcu1.2007-01-11/DEF \
	-chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-swap -stop=net -bigClusterHub=pk > swap.log 2>&1 &

    time doBlastzChainNet.pl -verbose=2 -smallClusterHub=pk \
	/cluster/data/oryLat1/bed/blastz.gasAcu1.2007-01-11/DEF \
	-chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-continue=net -swap -stop=net -bigClusterHub=pk > net-swap.log 2>&1 &
    #	real    18m21.951s

    ## Taking this chrUn chain result over to the other global result
    ssh kkstore05
    #	Note: previous result files temporarily moved out of the way
    cd /cluster/data/gasAcu1/bed/blastz.oryLat1.2006-12-22/axtChain
    chainSplit chain gasAcu1.oryLat1.all.chain.gz
    cp -p ../../blastz.orylat1.swap/axtChain/chain/chrUn.chain ./chain
    cd chain
    chainMergeSort *.chain | gzip -c > ../gasAcu1.oryLat1.all.chain.gz
    cd ..
    rm -fr chain
    #	they have new IDs
    chainSplit chain gasAcu1.oryLat1.all.chain.gz
    #	And then reusing the netChains.csh script, fixing up references
    #	to the appropriate chrom.sizes and 2bit file for gasAcu1
    cp netChains.csh netChainsUn.csh
    time ./netChainsUn.csh
    #	real    30m9.385s

    ## reloading, reuse the load.csh script, fixup reference to correct
    ##	chrom.sizes
    ssh hgwdev
    cd /cluster/data/gasAcu1/bed/blastz.oryLat1.2006-12-22/axtChain

###########################################################################
## SWAP FR1 BLASTZ (DONE - 2007-01-13 - Hiram)
    ssh kkstore02
    mkdir /cluster/data/gasAcu1/bed/blastz.fr1.swap
    cd /cluster/data/gasAcu1/bed/blastz.fr1.swap
    time doBlastzChainNet.pl -verbose=2 \
	/cluster/data/fr1/bed/blastz.gasAcu1.2007-01-11/DEF \
	-chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-swap -stop=net -bigClusterHub=pk > swap.log 2>&1 &
    #	real    130m31.028s

    ## Take that chrUn result into the primary fr1 blastz result
    cd /cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08/axtChain
    chainSplit chain gasAcu1.fr1.all.chain.gz
    cd chain
    cp -p /cluster/data/gasAcu1/bed/blastz.fr1.swap/axtChain/chain/chrUn.chain .
    chainMergeSort *.chain | gzip -c > ../gasAcu1.fr1.all.chain.gz
    cd ..
    chainSplit chain gasAcu1.fr1.all.chain.gz
    #	And then reusing the netChains.csh script, fixing up references
    #	to the appropriate chrom.sizes and 2bit file for gasAcu1
    ssh hgwdev
    cd /cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08/axtChain
    cp netChains.csh netChainsUn.csh
    time ./netChainsUn.csh
    #	real    122m36.719s
    #  And then reloading these, reusing the loadUp.csh script, fixup
    #	reference to chrom.sizes
    cp loadUp.csh reLoad.csh
    time ./reLoad.csh > reLoad.out 2>&1
    #	real    3m38.799s
    #	After moving previous stuff out of the way
    ./makeMd5sum.csh
    ./installDownloads.csh

#########################################################################
# BLASTZ/CHAIN/NET FR2 (DONE - 2007-01-23 - Hiram)
##  no chrUn in gasAcu1, and align to fr2 scaffolds,
##	results lifted to fr2 chrUn coordinates
    ssh kkstore05
    mkdir /cluster/data/gasAcu1/bed/blastz.fr2.2007-01-23
    cd /cluster/data/gasAcu1/bed/blastz.fr2.2007-01-23
    cat << '_EOF_' > DEF
# Stickleback vs. Fugu

# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Stippleback gasAcu1, no chrUn in this run
SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.sizes
SEQ1_CHUNK=40000000
SEQ1_LIMIT=30
SEQ1_LAP=0

# QUERY: Fugu fr2
#       Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates
SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit
SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/cluster/data/gasAcu1/bed/blastz.fr2.2007-01-23
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-verbose=2 -bigClusterHub=pk \
	-blastzOutRoot /cluster/bluearc/gasAcu1Fr2 > do.log 2>&1 &
    #	real    27m33.395s
    #	user    0m0.189s
    #	sys     0m0.163s
    # Error because of inconsistent chrom sizes. We had attempted to modify
    # our lift file to remove a gap inserted at the end of the entire
    # sequence, which we should not have done. 
    time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -verbose=2 -continue=chainRun -bigClusterHub=pk \
        -blastzOutRoot /cluster/bluearc/gasAcu1Fr2 > chainRun.log 2>&1 &

######################################################
## Now that it's finished, let's see how it did.  This already
## set up chain and net tracks for fr2 on gasAcu1, though a modification
## to ~/kent/src/hg/makeDb/trackDb/trackDb.ra needed to be done to make
## sure fr2 got displayed in the browser.
##
## track chainFr2
## shortLabel $o_db Chain
## longLabel $o_Organism ($o_date/$o_db) Chained Alignments
## group compGeno
## priority 140
## visibility hide
## color 100,50,0
## altColor 255,240,200
## matrix 16 91,-90,-25,-100,-90,100,-100,-25,-25,-100,100,-90,-100,-25,-90,91
## spectrum on
## type chain fr2
## otherDb fr2

## track netFr2
## shortLabel $o_db Net
## longLabel $o_Organism ($o_date/$o_db) Alignment Net
## group compGeno
## priority 140.1
## visibility hide
## spectrum on
## type netAlign fr2 chainFr2
## otherDb fr2

## To see how the matching did, let's use featureBits to find percentage of matching bases.
## This compares the gasAcu1 entries to only those in the Fr2 chain links that were created.
    ssh hgwdev
    cd /cluster/data/gasAcu1/bed
    ln -s blastz.fr2.2007-01-23 blastz.fr2
    cd blastz.fr2
    time nice -n 19 featureBits gasAcu1 chainFr2Link \
	> fb.gasAcu1.chainFr2Link.txt 2>&1 &
    #   153570946 bases of 446627861 (34.385%) in intersection

## In addition, we want to put the chains and nets from gasAcu1 onto the
## fr2 browser, done below:
    # This is going to make the link back on the fr2 browser to
    # gasAcu1.
    mkdir /cluster/data/fr2/bed/blastz.gasAcu1.swap
    cd /cluster/data/fr2/bed/blastz.gasAcu1.swap
    time doBlastzChainNet.pl \
	/cluster/data/gasAcu1/bed/blastz.fr2.2007-01-23/DEF \
	-chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-swap -bigClusterHub=pk > swap.log 2>&1 &
    #	real    24m33.761s
    #	user    0m0.125s
    #	sys     0m0.080s

    ## Now let's see how well these were matched back to Fr2.
    ssh hgwdev
    cd /cluster/data/fr2/bed/blastz.gasAcu1.swap
    time nice -n 19 featureBits fr2 chainGasAcu1Link \
	> fb.fr2.chainGasAcu1Link.txt 2>&1 &
    # 158383996 bases of 393312790 (40.269%) in intersection

###########################################################################
#  Manually fixing up the fr2 alignments to gasAcu1 to get chrUn results
##	(DONE - 2007-01-31 - 2007-02-02 - Hiram)
##	Ran the blastz on fr2 and aligned just the chrUn sequence to fr2
    ##	swap back to gasAcu1
    mkdir /cluster/data/gasAcu1/bed/blastz.fr2.swap
    cd /cluster/data/gasAcu1/bed/blastz.fr2.swap
    time doBlastzChainNet.pl -verbose=2 \
	/cluster/data/fr2/bed/blastz.gasAcu1.2007-01-31/DEF \
	-chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-swap -stop=chainMerge -bigClusterHub=pk > swap.log 2>&1 &

    ## Now, with that chain result in hand, place it manually back in with
    ##	the full chroms chains and re-run the nets and so forth.
    ssh kkstore05
    cd /cluster/data/gasAcu1/bed/blastz.fr2.2007-01-23/axtChain
    #	restore the chain directory which was removed by cleanUp.csh
    chainSplit chain gasAcu1.fr2.all.chain.gz
    #	copy into here the chrUn.chain result
    cd chain
    cp -p \
	/cluster/data/gasAcu1/bed/blastz.fr2.swap/axtChain/chain/chrUn.chain .
    time nice -n +19 chainMergeSort *.chain \
	| gzip -c > ../gasAcu1.fr2.all.chain.gz
    cd ..
    rm -fr chain
    #	they have new IDs
    chainSplit chain gasAcu1.fr2.all.chain.gz
    #	And then reusing the netChains.csh script, fixing up references
    #	to the appropriate chrom.sizes and 2bit file for gasAcu1
    cp netChains.csh netChainsUn.csh
    time nice -n +19 ./netChainsUn.csh
    #	real    30m9.385s

    ## reloading, reuse the load.csh script, fixup reference to correct
    ##	chrom.sizes
    ssh hgwdev
    cd /cluster/data/gasAcu1/bed/blastz.fr2.2007-01-23/axtChain
    time nice -n +19 ./reLoad.csh > reLoad.out 2>&1
    #	real    3m53.174s

    nice -n +19 featureBits gasAcu1 chainFr2Link \
	> fb.gasAcu1.chainFr2Link.txt 2>&1
    #	167817405 bases of 446627861 (37.574%) in intersection
    #	without chrUn covered before, it was:
    #	153570946 bases of 446627861 (34.385%) in intersection

    #	and fr2 is covered:
    nice -n +19 featureBits fr2 chainGasAcu1Link \
	> fb.fr2.chainGasAcu1Link.txt 2>&1
    #	158383996 bases of 393312790 (40.269%) in intersection

###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-01-27)
    ssh kkstore05
    bash # if not using bash shell already

    mkdir /cluster/data/gasAcu1/blastDb
    cd /cluster/data/gasAcu1
    awk '/chrUn/ {if ($5 == "W") print $6}' downloads/UCSC.gasAcu1.agp > chrUnContigs.lst
    zcat downloads/assembly.bases.gz | faSomeRecords stdin chrUnContigs.lst chrUnContigs.fa
    cat noUn/chr*fa > temp.fa
    faSplit gap temp.fa 1000000 blastDb/
    faSplit sequence chrUnContigs.fa 100 blastDb/y
    rm temp.fa
    cd blastDb
    for i in *.fa
    do
	/cluster/bluearc/blast229/formatdb -i $i -p F
    done
    rm *.fa

    mkdir -p /san/sanvol1/scratch/gasAcu1/blastDb
    cd /cluster/data/gasAcu1/blastDb
    for i in nhr nin nsq; 
    do 
	echo $i
	cp *.$i /san/sanvol1/scratch/gasAcu1/blastDb
    done

    mkdir -p /cluster/data/gasAcu1/bed/tblastn.hg18KG
    cd /cluster/data/gasAcu1/bed/tblastn.hg18KG
    echo  /san/sanvol1/scratch/gasAcu1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst
# 609 query.lst

   # we want around 200000 jobs
   calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\)

# 36727/(200000/609) = 111.833715

   mkdir -p /cluster/bluearc/gasAcu1/bed/tblastn.hg18KG/kgfa
   split -l 112 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  /cluster/bluearc/gasAcu1/bed/tblastn.hg18KG/kgfa/kg
   ln -s /cluster/bluearc/gasAcu1/bed/tblastn.hg18KG/kgfa kgfa
   cd kgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
     done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   mkdir -p /cluster/bluearc/gasAcu1/bed/tblastn.hg18KG/blastOut
   ln -s /cluster/bluearc/gasAcu1/bed/tblastn.hg18KG/blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/gasAcu1/bed/tblastn.hg18KG
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/gasAcu1/blastDb.lft carry $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3

        if pslCheck -prot $3.tmp                                                  
        then                                                                      
            mv $3.tmp $3                                                          
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0                                                                    
    fi                                                                            
fi                                                                                
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec
    exit # back to bash
    
    ssh pk
    cd /cluster/data/gasAcu1/bed/tblastn.hg18KG
    para create blastSpec
#    para try, check, push, check etc.

    para time

# Completed: 199752 of 199752 jobs
# CPU time in finished jobs:    4730447s   78840.78m  1314.01h   54.75d  0.150 y
# IO & Wait Time:                929809s   15496.82m   258.28h   10.76d  0.029 y
# Average job time:                  28s       0.47m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             519s       8.65m     0.14h    0.01d
# Submission to last job:         65891s    1098.18m    18.30h    0.76d

    ssh kkstore05
    cd /cluster/data/gasAcu1/bed/tblastn.hg18KG
    tcsh
    mkdir chainRun
    cd chainRun
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=100000 stdin /cluster/bluearc/gasAcu1/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
    chmod +x chainOne
    ls -1dS /cluster/bluearc/gasAcu1/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    # do the cluster run for chaining
    ssh kk
    cd /cluster/data/gasAcu1/bed/tblastn.hg18KG/chainRun
    para create chainSpec
    para maxNode 30
    para try, check, push, check etc.


# Completed: 328 of 328 jobs
# CPU time in finished jobs:      25906s     431.77m     7.20h    0.30d  0.001 y
# IO & Wait Time:                 77171s    1286.18m    21.44h    0.89d  0.002 y
# Average job time:                 314s       5.24m     0.09h    0.00d
# Longest finished job:             942s      15.70m     0.26h    0.01d
# Submission to last job:          3630s      60.50m     1.01h    0.04d

    ssh kkstore05
    cd /cluster/data/gasAcu1/bed/tblastn.hg18KG/blastOut
    bash # if using another shell
    for i in kg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/gasAcu1/bed/tblastn.hg18KG/unliftBlastHg18KG.psl
    cd ..
    pslCheck unliftBlastHg18KG.psl
    liftUp blastHg18KG.psl ../../Un/chrUn.lft carry unliftBlastHg18KG.psl

    # load table 
    ssh hgwdev
    cd /cluster/data/gasAcu1/bed/tblastn.hg18KG
    hgLoadPsl gasAcu1 blastHg18KG.psl

    # check coverage
    featureBits gasAcu1 blastHg18KG 
# 20127551 bases of 446627861 (4.507%) in intersection

    featureBits gasAcu1 ensGene:cds blastHg18KG  -enrichment
# ensGene:cds 6.618%, blastHg18KG 4.507%, both 3.790%, cover 57.26%, enrich 12.71x

    ssh kkstore05
    rm -rf /cluster/data/gasAcu1/bed/tblastn.hg18KG/blastOut
    rm -rf /cluster/bluearc/gasAcu1/bed/tblastn.hg18KG/blastOut
#end tblastn

#######################################################################
## BLASTZ LIZARD ANOCAR1 - (DONE - 2007-02-18 - Hiram)
    ssh kkstore05
    mkdir /cluster/data/gasAcu1/bed/blastz.anoCar1.2007-02-18
    cd /cluster/data/gasAcu1/bed/blastz.anoCar1.2007-02-18

    cat << '_EOF_' > DEF
# stickleback vs. lizard

# Use same params as used for gasAcu1-danRer4
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Stickleback gasAcu1 - largest chunk big enough
#	for all chroms except chrUn
SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.2bit
SEQ1_LEN=/cluster/data/gasAcu1/chrom.sizes
SEQ1_CHUNK=40000000
SEQ1_LAP=10000

# QUERY: Lizard AnoCar1 - largest chunk big enough for largest scaffold
SEQ2_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit
SEQ2_LEN=/san/sanvol1/scratch/anoCar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/cluster/data/gasAcu1/bed/blastz.anoCar1.2007-02-18
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time doBlastzChainNet.pl -verbose=2 DEF \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-blastzOutRoot=/san/sanvol1/scratch/gasAcu1AnoCar1 > do.log 2>&1 &
    time doBlastzChainNet.pl -verbose=2 DEF \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-blastzOutRoot=/san/sanvol1/scratch/gasAcu1AnoCar1 \
	-continue=cleanup > cleanup.log 2>&1 &

    ## measuring
    time nice -n +19 featureBits gasAcu1 chainAnoCar1Link \
	> fb.gasAcu1.chainAnoCar1Link.txt 2>&1
    #	56386298 bases of 446627861 (12.625%) in intersection

    ## swap documented in anoCar1.txt
    time doBlastzChainNet.pl -verbose=2 \
	/cluster/data/gasAcu1/bed/blastz.anoCar1.2007-02-19/DEF \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap > swap.log 2>&1 &

    time nice -n +19 featureBits anoCar1 chainGasAcu1Link \
	> fb.anoCar1.chainGasAcu1Link.txt 2>&1  
    #	real    1m14.245s
    #	54464074 bases of 1741478929 (3.127%) in intersection


#######################################################################
# SELF CHAINS (DONE 3/14/07 angie)
# Requested by Gill.
    ssh kkstore05
    mkdir /cluster/data/gasAcu1/bed/blastz.gasAcu1.2007-03-13
    cd /cluster/data/gasAcu1/bed/blastz.gasAcu1.2007-03-13
    cat << '_EOF_' > DEF
# stickleback vs. self

BLASTZ_H=2000
BLASTZ_M=200

# TARGET
# Stickleback
SEQ1_DIR=/iscratch/i/gasAcu1/gasAcu1.noUn.sdTrf.2bit
SEQ1_LEN=/iscratch/i/gasAcu1/gasAcu1.noUn.sdTrf.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY
# Stickleback
SEQ2_DIR=/iscratch/i/gasAcu1/gasAcu1.noUn.sdTrf.2bit
SEQ2_LEN=/iscratch/i/gasAcu1/gasAcu1.noUn.sdTrf.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=10000

BASE=/cluster/data/gasAcu1/bed/blastz.gasAcu1.2007-03-13
TMPDIR=/scratch/tmp
'_EOF_'
    # << emacs
    doBlastzChainNet.pl DEF \
      -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
      -chainMinScore=5000 -chainLinearGap=medium \
      -blastzOutRoot=/cluster/bluearc/gasAcu1Self >& do.log &
    tail -f do.log
    # hgwdev can't see /iscratch, so the loadUp.csh script failed looking
    # for SEQ1_LEN.  So I edited DEF to use /cluster/data location and 
    # continued -load.

##########################################################################
# NSCAN track - (2007-06-05 markd)
# uses danRer4 as informant, no pseudogene masking 
#
    cd /cluster/data/gasAcu1/bed/nscan/

    # obtainedf NSCAN predictions from michael brent's group
    # at WUSTL
    wget -nv http://mblab.wustl.edu/predictions/Gasterosteus_aculeatus/gasAcu1.gtf
    wget -nv http://mblab.wustl.edu/predictions/Gasterosteus_aculeatus/gasAcu1.prot.fa

    gzip -9 gasAcu1.*
    chmod a-w *.gz

    # load tracks.  Note that these have *utr features, rather than
    # exon features.  currently ldHgGene creates separate genePred exons
    # for these.
    ldHgGene -bin -gtf -genePredExt gasAcu1 nscanGene gasAcu1.gtf.gz
    hgPepPred gasAcu1 generic nscanPep gasAcu1.prot.fa.gz
    rm -f *.tab

    # update trackDb; need a gasAcu1-specific page to describe informants
    stickleback/gasAcu1/nscanGene.html    
    stickleback/gasAcu1/trackDb.ra

#########################################################################
# BLASTZ SWAP from Mouse Mm9 (DONE - 2007-09-07 - Hiram)

    #	the original
    cd /cluster/data/mm9/bed/blastzGasAcu1.2007-09-06
    cat fb.mm9.chainGasAcu1Link.txt
    #	48448585 bases of 2620346127 (1.849%) in intersection

    #	and the swap
    mkdir /cluster/data/gasAcu1/bed/blastz.mm9.swap
    cd /cluster/data/gasAcu1/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 \
	/cluster/data/mm9/bed/blastzGasAcu1.2007-09-06/DEF \
	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
	-swap -bigClusterHub=kk -verbose=2 > swap.log 2>&1 &
    cat fb.gasAcu1.chainMm9Link.txt
    #	43730193 bases of 446627861 (9.791%) in intersection

#########################################################################
############################################################################
# TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20

see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build  (2008-06-30 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30

see doc/builds.txt for specific details.

############################################################################
# SWAP BLASTZ Medaka oryLat2 (DONE - 2008-08-27 - Hiram)
    ssh kkstore02	# not too important since everything moved to hive
    screen # use screen to control this job
    cd /cluster/data/oryLat2/bed/blastz.gasAcu1.2008-08-26
    cat fb.oryLat2.chainGasAcu1Link.txt
    #	211589769 bases of 700386597 (30.210%) in intersection

    #	And, for the swap:
    mkdir /cluster/data/gasAcu1/bed/blastz.oryLat2.swap
    cd /cluster/data/gasAcu1/bed/blastz.oryLat2.swap
    time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium -swap \
	/cluster/data/oryLat2/bed/blastz.gasAcu1.2008-08-26/DEF \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-verbose=2 -smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 &
    #	real    31m44.711s
    cat fb.gasAcu1.chainOryLat2Link.txt
    #	189793612 bases of 446627861 (42.495%) in intersection

############################################################################
# add NCBI identifiers to the dbDb (DONE - 2008-10-21 - Hiram)
    hgsql -e 'update dbDb set
sourceName="Broad Institute gasAcu1 (1.0) (NCBI project 13579, AANH01000000)" where name="gasAcu1";' hgcentraltest
###########################################################################
############################################################################
# TRANSMAP vertebrate.2009-07-01 build  (2009-07-21 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01

see doc/builds.txt for specific details.
############################################################################
# BLASTZ/CHAIN/NET TetNig2 (DONE - 2009-08-10,09-15 - Hiram)
    #	create contigs only sequence to align properly to gasAcu1 contigs
    mkdir /hive/data/genomes/gasAcu1/nonBridged
    cd /hive/data/genomes/gasAcu1/nonBridged
    gapToLift -verbose=2 gasAcu1 gasAcu1.contigs.lift \
	-bedFile=gasAcu1.contigs.bed
#       chrom count: 23
# WARNING: gap at end of chromosome not telomere at
#		chrUn:62549211-62550211, type: clone
#       found 16945 gaps
#       bed output requested to gasAcu1.contigs.bed
#       no gaps on chrom: chrM, size: 15742
    ~/kent/src/hg/utils/lft2BitToFa.pl ../gasAcu1.2bit gasAcu1.contigs.lift \
	| gzip -c > gasAcu1.contigs.fa.gz
    #	make sure nothing was destroyed:
    faCount *.fa.gz > faCount.contigs.txt 2>&1
    twoBitToFa ../gasAcu1.2bit stdout | faCount stdin > faCount.2bit.txt 2>&1
    tail -1 faCount.contigs.txt
# total   461441448       123670916       99610982        99564587
#	123781376       14813587        14615136
    tail -1 faCount.2bit.txt
# total   463354448       123670916       99610982        99564587
#	123781376       16726587        14615136

    #	only the total size and N count are different
    faToTwoBit gasAcu1.contigs.fa.gz gasAcu1.contigs.2bit
    twoBitInfo gasAcu1.contigs.2bit stdout | sort -k2nr > gasAcu1.contigs.sizes
    cp -p gasAcu1.contigs.2bit gasAcu1.contigs.sizes gasAcu1.contigs.lift \
	/hive/data/staging/data/gasAcu1


    mkdir /hive/data/genomes/gasAcu1/bed/lastzTetNig2.2009-08-10
    cd /hive/data/genomes/gasAcu1/bed/lastzTetNig2.2009-08-10

    cat << '_EOF_' > DEF
# Stickleback vs. Tetraodon

# TARGET: Stickleback gasAcu1, chunk large enough to run largest piece
SEQ1_DIR=/scratch/data/gasAcu1/gasAcu1.2bit
SEQ1_LEN=/scratch/data/gasAcu1/chrom.sizes
SEQ1_CTGDIR=/scratch/data/gasAcu1/gasAcu1.contigs.2bit
SEQ1_CTGLEN=/scratch/data/gasAcu1/gasAcu1.contigs.sizes
SEQ1_LIFT=/scratch/data/gasAcu1/gasAcu1.contigs.lift
SEQ1_CHUNK=22000000
SEQ1_LAP=10000
SEQ1_LIMIT=50

# QUERY: Tetraodon TetNig2 - single chunk big enough to run single largest item
SEQ2_DIR=/scratch/data/tetNig2/tetNig2.2bit
SEQ2_LEN=/scratch/data/tetNig2/chrom.sizes
SEQ2_CTGDIR=/scratch/data/tetNig2/tetNig2.contigs.2bit
SEQ2_CTGLEN=/scratch/data/tetNig2/tetNig2.contigs.sizes
SEQ2_LIFT=/scratch/data/tetNig2/tetNig2.contigs.lift
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=50

BASE=/hive/data/genomes/gasAcu1/bed/lastzTetNig2.2009-08-10
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	about 72 minutes
    #	forgot to indicate type of repeats, continuing the load:
    cd /hive/data/genomes/gasAcu1/bed/lastzTetNig2.2009-08-10/axtChain
    netClass -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -verbose=0 -noAr noClass.net gasAcu1 tetNig2 gasAcu1.tetNig2.net
    netFilter -minGap=10 gasAcu1.tetNig2.net \
	| hgLoadNet -verbose=0 gasAcu1 netTetNig2 stdin
    cd ..
    featureBits gasAcu1 chainTetNig2Link >&fb.gasAcu1.chainTetNig2Link.txt
    cat fb.gasAcu1.chainTetNig2Link.txt
    #	134497679 bases of 446627861 (30.114%) in intersection

    #	and finish the downloads
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-continue=download > downloads.log 2>&1

    mkdir /hive/data/genomes/tetNig2/bed/blastz.gasAcu1.swap
    cd /hive/data/genomes/tetNig2/bed/blastz.gasAcu1.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/gasAcu1/bed/lastzTetNig2.2009-08-10/DEF \
	-swap -qRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> swap.log 2>&1 &

############################################################################
# TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

see doc/builds.txt for specific details.
#########################################################################
# LASTZ/CHAIN/NET swap danRer6 (DONE - 2009-12-21 - Galt)
    # original alignment to danRer6
    cd /hive/data/genomes/danRer6/bed/lastzGasAcu1.2009-12-21
    cat fb.danRer6.chainGasAcu1Link.txt
    #  147479454 bases of 1506896106 (9.787%) in intersection

    #   running the swap - DONE - 2009-12-21
    mkdir /hive/data/genomes/gasAcu1/bed/blastz.danRer6.swap
    cd /hive/data/genomes/gasAcu1/bed/blastz.danRer6.swap
    time nice +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/danRer6/bed/lastzGasAcu1.2009-12-21/DEF \
        -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -swap >& swap.log &
    cat fb.gasAcu1.chainDanRer6Link.txt
    #  114451338 bases of 446627861 (25.626%) in intersection

#######################################################################
# lastz swap danRer7 (DONE - 2010-12-17 - Hiram
    #	original alignment to danRer7
    cd /hive/data/genomes/danRer7/bed/lastzGasAcu1.2010-12-17
    cat fb.danRer7.chainGasAcu1Link.txt 
    #	144782335 bases of 1409770109 (10.270%) in intersection

    #	running the swap
    mkdir /hive/data/genomes/gasAcu1/bed/blastz.danRer7.swap
    cd /hive/data/genomes/gasAcu1/bed/blastz.danRer7.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/danRer7/bed/lastzGasAcu1.2010-12-17/DEF \
	-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-swap > swap.log 2>&1 &
    #	real    25m35.024s
    cat fb.gasAcu1.chainDanRer7Link.txt 
    #	122918766 bases of 446627861 (27.522%) in intersection

#########################################################################
# lastz swap lizard anoCar2 (DONE - 2011-04-27 - Hiram)
    # the original alignment
    cd /cluster/data/anoCar2/bed/lastzGasAcu1.2011-04-26
    cat fb.anoCar2.chainGasAcu1Link.txt 
    #	53478872 bases of 1701353770 (3.143%) in intersection

    ## and running the swap
    mkdir /hive/data/genomes/gasAcu1/bed/blastz.anoCar2.swap
    cd /hive/data/genomes/gasAcu1/bed/blastz.anoCar2.swap
    time nice -n +25 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/anoCar2/bed/lastzGasAcu1.2011-04-26/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-syntenicNet -workhorse=hgwdev -smallClusterHub=encodek \
	-bigClusterHub=swarm -tRepeats=windowmaskerSdust \
	-swap -qRepeats=windowmaskerSdust > swap.log 2>&1 &
    #	real    15m28.724s
    cat fb.gasAcu1.chainAnoCar2Link.txt 
    #	55990919 bases of 446627861 (12.536%) in intersection

#######################################################################
## WINDOWMASKER (DONE - 2011-12-01 - Hiram)
    #	this was actually run in 2006 without the extra step of eliminating
    #	the gap masking.  The table was not reloaded with this result since
    #	the old one is on the RR.  This business was done to run an experiment
    #	with the fr3 lastz to see if it actually matters or not if the
    #	query genome here is more masked or less.
    mkdir /hive/data/genomes/gasAcu1/bed/windowMasker
    cd /hive/data/genomes/gasAcu1/bed/windowMasker
    time doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev gasAcu1 > do.log 2>&1 &
    #	real    21m10.950s

    # Masking statistics
    twoBitToFa gasAcu1.wmsk.sdust.2bit stdout | faSize stdin
    #	463354448 bases (16726587 N's 446627861 real 349071712 upper
    #	97556149 lower) in 23 sequences in 1 files
    #	%21.05 masked total, %21.84 masked real

    hgLoadBed gasAcu1 windowmaskerSdust windowmasker.sdust.bed.gz
    # Loaded 2285358 elements of size 3

    featureBits -countGaps gasAcu1 windowmaskerSdust
    #	114282736 bases of 463354448 (24.664%) in intersection

    #	eliminate the gaps from the masking
    featureBits gasAcu1 -not gap -bed=notGap.bed
    #	446627861 bases of 446627861 (100.000%) in intersection
    time nice -n +19 featureBits gasAcu1 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    #	97556149 bases of 446627861 (21.843%) in intersection

    featureBits -countGaps gasAcu1 cleanWMask.bed.gz
    #	97556149 bases of 463354448 (21.054%) in intersection

    zcat cleanWMask.bed.gz \
	| twoBitMask ../../gasAcu1.unmasked.2bit stdin \
	    -type=.bed gasAcu1.cleanWMSdust.2bit
    twoBitToFa gasAcu1.cleanWMSdust.2bit stdout | faSize stdin \
        > gasAcu1.cleanWMSdust.faSize.txt
    cat gasAcu1.cleanWMSdust.faSize.txt
    #	463354448 bases (16726587 N's 446627861 real 349071712 upper
    #	97556149 lower) in 23 sequences in 1 files
    #	%21.05 masked total, %21.84 masked real

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps gasAcu1 rmsk cleanWMask.bed.gz
    #	13193723 bases of 463354448 (2.847%) in intersection

    # add TRF mask:
    twoBitMask gasAcu1.cleanWMSdust.2bit \
      -add ../simpleRepeat/trfMask.bed gasAcu1.TRF.WMSdust.2bit
    twoBitToFa gasAcu1.TRF.WMSdust.2bit stdout | faSize stdin \
        > gasAcu1.TRF.WMSdust.faSize.txt

    #	463354448 bases (16726587 N's 446627861 real 348847599 upper
    #	97780262 lower) in 23 sequences in 1 files
    #	%21.10 masked total, %21.89 masked real

##########################################################################
