# for emacs: -*- mode: sh; -*-


#	Creating the assembly for Monodelphis domestica
#	South American, Short-tailed Opossum
#	http://www.genome.gov/11510687
#	http://www.genome.gov/12512285

#  NOTE:  this doc may have genePred loads that fail to include
#  the bin column.  Please correct that for the next build by adding
#  a bin column when you make any of these tables:
#
#  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
#  +-------------+---------------------------------+
#  | tableName   | type                            |
#  +-------------+---------------------------------+
#  | refGene     | genePred refPep refMrna         |
#  | xenoRefGene | genePred xenoRefPep xenoRefMrna |
#  | ensGene     | genePred ensPep                 |
#  | genscan     | genePred genscanPep             |
#  +-------------+---------------------------------+


#########################################################################
# DOWNLOAD SEQUENCE (DONE - 2005-08-18 - Hiram)
    ssh kkstore01
    mkdir -p /cluster/store9/monDom4/broad.mit.edu
    ln -s /cluster/store9/monDom4 /cluster/data/monDom4
    cd /cluster/data/monDom4/broad.mit.edu
    time wget --timestamping --force-directories --directory-prefix=. \
        --dont-remove-listing --recursive --level=2 --no-parent \
        --no-host-directories --cut-dirs=5 \
 ftp://ftp.broad.mit.edu/distribution/assemblies/mammals/monodelphis/monDom4/*
    #	That takes a number of hours, ending up with:
# -rw-rw-r--  1    7492963 Jan  5 18:10 Monodelphis4.0.agp
# -rw-rw-r--  1       1160 Jan  5 18:10 MappedAssembly.Table
# -rw-rw-r--  1        716 Jan  5 18:10 ForDistribution.command
# -rw-rw-r--  1       1215 Jan  5 18:10 BasicStats.out
# -rw-rw-r--  1       1512 Jan  5 18:11 ReadUsage.Table
# -rw-rw-r--  1  474573344 Jan  5 18:11 Monodelphis4.0.agp.chromosome.qual.gz
# -rw-rw-r--  1 1201437533 Jan  5 18:11 Monodelphis4.0.agp.chromosome.fasta.gz
# -rw-rw-r--  1    7647098 Jan  5 18:12 assembly.agp
# -rw-rw-r--  1   14841256 Jan  5 18:13 assembly.markup
# -rw-rw-r--  1    2838523 Jan  5 18:13 assembly.links
# -rw-rw-r--  1  474281414 Jan  5 18:13 assembly.agp.qual.gz
# -rw-rw-r--  1 1201095284 Jan  5 18:13 assembly.agp.fasta.gz
# -rw-rw-r--  1  176159300 Jan  5 18:14 assembly.unplaced
# -rw-rw-r--  1 1151905015 Jan  5 18:14 assembly.reads.gz
# -rw-rw-r--  1         43 Jan  5 18:15 source
# -rw-rw-r--  1  471180166 Jan  5 18:15 contigs.quals.gz
# -rw-rw-r--  1 1198813602 Jan  5 18:15 contigs.bases.gz
# -rw-rw-r--  1  757582328 Jan  5 18:16 unplaced.fasta.gz
# -rw-rw-r--  1 2083924258 Jan  5 18:18 unplaced.qual.gz
# -rw-rw-r--  1 1159288222 Jan  5 19:17 softmasked_monDom3.tgz

    zcat broad.mit.edu/Monodelphis4.0.agp.chromosome.fasta.gz \
	| faSplit -verbose=2 byname stdin broadSplit/

    #	combine the Broad split files into single chroms
for C in 1 2 3 4 5 6 7 8 X Un
do
    mkdir -p ${C}
    rm -f ${C}/chr${C}.fa
    echo ">chr${C}" > ${C}/chr${C}.fa
    echo -n "${C}/chr${C}.fa working ... "
    ls broadSplit/${C}.*-*.fa | sort -t"." -k2,2n | while read F
    do
        grep -v "^>" ${F} >> ${C}/chr${C}.fa
    done
    echo "done"
done

    #	create a 2bit file to get a blastz hg18 started
    #	this will be replaced later after RepeatMasker with masked
    #	sequence
    faToTwoBit ?/chr?.fa Un/chrUn.fa monDom4.2bit
    #   generate chrom.sizes list in order by size, biggest first
    #   This listing is going to be used in a variety of procedures
    #   below
    twoBitInfo monDom4.2bit stdout | sort -rn +1 > chrom.sizes
    mkdir /san/sanvol1/scratch/monDom4
    cp -p monDom4.2bit /san/sanvol1/scratch/monDom4
    cp -p chrom.sizes /san/sanvol1/scratch/monDom4
    #	split into 500K chunks for RepeatMasker
    mkdir split500K
for C in 1 2 3 4 5 6 7 8 X Un
do
    mkdir split500K/${C}
    faSplit -verbose=2 size ${C}/chr${C}.fa 500000 split500K/${C}/chr${C}_ \
        -lift=split500K/${C}.lft
done
    mkdir lifts
    cat split500K/*.lft > lifts/lift500K.lft

###########################################################################
#  RepeatMasker run (DONE 2006-02-06 - Hiram)
    ssh kk
    mkdir /cluster/data/monDom4/RMRun
    mkdir /cluster/data/monDom4/RMOut
    mkdir /cluster/data/monDom4/scripts
    cd /cluster/data/monDom4/split500K
for C in 1 2 3 4 5 6 7 8 X Un
do
    ls -1S ${C}/*.fa
done > ../RMRun/split.list

    cd /cluster/data/monDom4/RMRun
    cat << '_EOF_' > template
#LOOP
../scripts/RMOpossum $(dir1) $(root1) {check out line ../RMOut/$(dir1)/$(root1).out}
#ENDLOOP
'_EOF_'
    #	happy emacs

    cat << '_EOF_' > ../scripts/RMOpossum
#!/bin/csh -fe
/bin/mkdir -p /scratch/tmp/monDom4/$1/$2
/bin/mkdir -p ../RMOut/$1
/bin/cp ../split500K/$1/$2.fa /scratch/tmp/monDom4/$1/$2/$2.fa
pushd /scratch/tmp/monDom4/$1/$2
/cluster/bluearc/RepeatMasker060120/RepeatMasker -s \
        -species "Monodelphis domestica" $2.fa
popd
/bin/cp -p /scratch/tmp/monDom4/$1/$2/$2.fa.out $3
/bin/rm -fr /scratch/tmp/monDom4/$1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4/$1/$2
/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4/$1
/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4
'_EOF_'
    # happy emacs
    chmod +x ../scripts/RMOpossum

    gensub2 split.list single template jobList
    para create jobList
    para try ... check ... push ... etc ...
# Completed: 7170 of 7170 jobs
# CPU time in finished jobs:   26375191s  439586.52m  7326.44h  305.27d  0.836 y
# IO & Wait Time:               1584329s   26405.48m   440.09h   18.34d  0.050 y
# Average job time:                3900s      64.99m     1.08h    0.05d
# Longest finished job:           12608s     210.13m     3.50h    0.15d
# Submission to last job:        162734s    2712.23m    45.20h    1.88d

    #	lift results up to chrom coordinates
    ssh kkstore01
    cd /cluster/data/monDom4/RMOut
    for C in 1 2 3 4 5 6 7 8 X Un
    do
	echo "chr${C} working ... "
	(cat ${C}/chr${C}_000.out ${C}/chr${C}_0000.out 2> /dev/null; \
	    ls ${C}/chr${C}_*.out | egrep -v "_000.out|_0000.out" \
		| xargs tail --quiet --lines=+4) | \
	    liftUp ../${C}/chr${C}.fa.out ../lifts/lift500K.lft error stdin
	echo "chr${C} done"
    done
    #	the seemingly duplicate mention of _000.out and _0000.out takes
    #	the seemingly duplicate mention of _000.out and _0000.out takes
    #	care of the case where one or the other exists, but both never
    #	exist.  One will not exist and the cat will complain to stderr, hence
    #	the redirection of cat complaints to dev null.

###########################################################################
# BLASTZ HUMAN hg18 - to quickly investigate monDom4 to see if it has the same
# pile up problem that was in monDom2.  Got this started right after the
# monDom4.2bit file was made from the original Broad chrom fasta
    ssh pk
    mkdir /cluster/store9/monDom4/bed/blastzHg18.2006-02-08
    cd /cluster/store9/monDom4/bed/blastzHg18.2006-02-08

    cat << '_EOF_' > DEF
# human vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Human (hg18)
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Opossum monDom4
SEQ2_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
SEQ2_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/monDom4/bed/blastzHg18.2006-02-08
TMPDIR=/scratch/tmp
'_EOF_'
    #	happy emacs

    
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	`pwd`/DEF > blastz.out 2>&1 &
XXXX - running 2006-02-08 10:06
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=chainMerge `pwd`/DEF > chainMerge.out 2>&1 &

###########################################################################
#	Initial database setup (DONE - 2006-02-06 - Hiram)
    ssh hgwdev
    cd /cluster/data/monDom4
    mkdir /gbdb/monDom4
    ln -s /cluster/data/monDom4/monDom4.2bit /gbdb/monDom4
    twoBitInfo monDom4.2bit stdout \
        | awk '{printf "%s\t%s\t/gbdb/monDom4/monDom4.2bit\n", $1, $2}' \
        > chromInfo.tab

    hgsql -e "create database monDom4;" mysql
    hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp;" \
                monDom4
    hgsql monDom4 < $HOME/kent/src/hg/lib/chromInfo.sql
    hgsql -e 'load data local infile "chromInfo.tab" into table chromInfo;' \
        monDom4
    hgsql monDom4 -N -e 'select chrom from chromInfo' > chrom.lst

    # Enter monDom4 into dbDb and defaultDb so test browser knows about
    # it:
    hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
        defaultPos, active, orderKey, genome, scientificName, \
        htmlPath, hgNearOk, hgPbOk, sourceName) \
        VALUES("monDom4", "Jan 2006", "/gbdb/monDom4", "Opossum", \
        "chr4:344283621-363415496", 1, 34, "Opossum", \
        "Monodelphis domestica", \
        "/gbdb/monDom4/html/description.html", 0, 0, \
        "Broad Inst. monDom4 Jan06");' \
        -h localhost hgcentraltest
    #	do this update later when it is time to make this be the default
    hgsql -e 'update defaultDb set name="monDom4" where genome="Opossum";' \
	hgcentraltest
    #	update default position to a HOX region:
    hgsql -e \
'update dbDb set defaultPos="chr8:293325654-293362847" where name="monDom4";' \
	hgcentraltest
	chr8:293,325,654-293,362,847

    # Make trackDb table so browser knows what tracks to expect
    mkdir /cluster/data/monDom4/html
    ln -s /cluster/data/monDom4/html /gbdb/monDom4/html
    mkdir $HOME/kent/src/hg/makeDb/trackDb/opossum/monDom4
    cd $HOME/kent/src/hg/makeDb/trackDb/opossum
    cvs add monDom4
    cd monDom4
    cp -p /cluster/data/monDom2/html/description.html ./description.html
    #	edit that description to update
    cvs add description.html
    cvs commit
    cd $HOME/kent/src/hg/makeDb/trackDb
    #	add monDom4 to the makefile, then
    make DBS=monDom4
    #	or
    make DBS=monDom4 alpha
    #	to place it on the genome-test browser

#######################################################################
# SIMPLE REPEAT [TRF] TRACK (DONE - 2006-02-08 - Hiram)
#   This can be done while the above repeat masker run is working
    ssh kkstore01
    mkdir /cluster/data/monDom4/bed/simpleRepeat
    cd /cluster/data/monDom4
    #	split into 50M chunks for trfBig
    mkdir split50M
for C in 1 2 3 4 5 6 7 8 X Un
do
    mkdir split50M/${C}
    faSplit -verbose=2 size ${C}/chr${C}.fa 50000000 split50M/${C}/chr${C}_ \
        -lift=split50M/${C}.lft
done
    cat split50M/*.lft > lifts/lift50M.lft
    cd split50M
    ls -1S */*.fa > ../bed/simpleRepeat/split.list

    #	small kluster for these larger jobs
    ssh kki
    cd /cluster/data/monDom4/bed/simpleRepeat
    mkdir trf

    cat << '_EOF_' > template
#LOOP
./runTrf.csh $(dir1) $(root1) {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
    #	happy emacs

    cat << '_EOF_' > runTrf.csh
#!/bin/csh -fe
set workDir = /scratch/tmp/monDom4TRF/$1/$2
set inputFN = ../../split50M/$1/$2.fa
set outputFN = $3
mkdir -p $workDir
cp -p $inputFN $workDir
pushd $workDir
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $2.fa /dev/null -bedAt=$2.bed -tempDir=/scratch/tmp
popd
rm -f $outputFN
cp -p $workDir/$2.bed $outputFN
rm -fr $workDir/*
rmdir --ignore-fail-on-non-empty $workDir
rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4TRF/$1
rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4TRF
'_EOF_'
    #	happy emacs
    chmod +x runTrf.csh

    gensub2 split.list single template jobList
    para create jobList
    para try ... check ... push ... etc ...
# Completed: 77 of 77 jobs
# CPU time in finished jobs:      16109s     268.48m     4.47h    0.19d  0.001 y
# IO & Wait Time:                  1199s      19.99m     0.33h    0.01d  0.000 y
# Average job time:                 225s       3.75m     0.06h    0.00d
# Longest finished job:            1322s      22.03m     0.37h    0.02d
# Submission to last job:          1870s      31.17m     0.52h    0.02d

    #	lift to chrom coordinates
    ssh kkstore01
    cd /cluster/data/monDom4/bed/simpleRepeat
    find ./trf -type f | xargs cat | \
        liftUp simpleRepeat.bed \
	    /cluster/data/monDom4/lifts/lift50M.lft \
		error stdin

     # Load into the database:
    ssh hgwdev
    cd /cluster/data/monDom4/bed/simpleRepeat
    hgLoadBed -strict monDom4 simpleRepeat simpleRepeat.bed \
        -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    #	Loaded 710087 elements of size 16

    #   Compare with previous assembly
    time featureBits monDom4 simpleRepeat
    #	68337838 bases of 3501643220 (1.952%) in intersection
    #	real    0m42.137s
    time featureBits monDom2 simpleRepeat
    #	69239555 bases of 3496445653 (1.980%) in intersection
    #	real    1m5.716s
    time featureBits monDom1 simpleRepeat
    #	55000238 bases of 3492108230 (1.575%) in intersection
    #	real    1m49.240s

#######################################################################
# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2006-02-10 - Hiram)
# After the simpleRepeats track has been built, make a filtered
# version 
# of the trf output: keep trf's with period <= 12:
#	need these mask files on a per-chrom basis
    ssh kkstore01
    cd /cluster/data/monDom4/bed/simpleRepeat

    mkdir chromMask
    for C in 1 2 3 4 5 6 7 8 X Un
    do
	echo "chr${C} working ..."
	cat trf/chr${C}_*.bed | liftUp chromMask/chr${C}.bed \
	    /cluster/data/monDom4/lifts/lift50M.lft \
		error stdin
	awk '{if ($5 <= 12) print;}' chromMask/chr${C}.bed \
		> chromMask/chr${C}_Mask.bed
	echo "chr${C} done"
    done

    #	should be same result as if we had done the whole thing as a
    #	single piece:
    cat chromMask/*_Mask.bed | wc -l
    #	485364
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed | wc -l
    #	485364

#######################################################################
# MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE - 2006-02-10 - Hiram)
#	Additional masking was added later to use Damian Keefe's repeat
#	libraries.
    #	the new style binRange was in development at this time, so using
    #	binaries built today with that new functionality.  Next time
    #	these will be the standard commands
    ssh kkstore01
    cd /cluster/data/monDom4
    #	verify sequence before and after masking to ensure it doesn't
    #	get broken
    time faCount ?/chr?.fa Un/chrUn.fa
#seq    len     A       C       G       T       N       cpg
# chr1  748055161 227831941 138649610 138661137 228270299 14642174 3225777
# chr2  541556283 163736285 100133171 100127159 163985111 13574557 2505214
# chr3  526135210 161221343  95632761  95680478 160815407 12785221 2208210
# chr4  430141050 130190442  78748372  78684824 130210194 12307218 1877031
# chr5  308900514  94866760  56126597  56061671  94769331  7076155 1311015
# chr6  244784211  73686567  45271636  45267221  73515529  7043258 1216115
# chr7  262624689  81219469  47052213  47076531  81079019  6197457 1127925
# chr8  308469712  93087343  56578139  56466687  93245129  9092414 1427329
# chrX   60718501  16286269  11251258  11264518  16304629  5611827  371051
# chrUn 174229318 46556476  32785341  32891615  46354738 15641148 1620662
# total 3605614649 1088682895 662229098 662181841 1088549386 103971429 16890329
    #	real    1m51.555s

    cat << '_EOF_' > scripts/maskChroms.sh
#!/bin/sh

for C in 1 2 3 4 5 6 7 8 X Un
do
    echo "chr${C} working ..."
    $HOME/bin/x86_64/maskOutFa -soft ${C}/chr${C}.fa \
        bed/simpleRepeat/chromMask/chr${C}_Mask.bed ${C}/chr${C}_masked.fa
    $HOME/bin/x86_64/maskOutFa -softAdd ${C}/chr${C}_masked.fa \
        ${C}/chr${C}.fa.out ${C}/chr${C}_masked.fa
    echo "done"
done
'_EOF_'
    #	happy emacs
    chmod +x scripts/maskChroms.sh
    time ./scripts/maskChroms.sh

    #	should have the same sequence still
    time faCount ?/chr?_masked.fa Un/chrUn_masked.fa
#seq    len     A       C       G       T       N       cpg
chr1  748055161 227831941 138649610 138661137 228270299 14642174  3225777
chr2  541556283 163736285 100133171 100127159 163985111 13574557  2505214
chr3  526135210 161221343  95632761  95680478 160815407 12785221  2208210
chr4  430141050 130190442  78748372  78684824 130210194 12307218  1877031
chr5  308900514  94866760  56126597  56061671  94769331  7076155  1311015
chr6  244784211  73686567  45271636  45267221  73515529  7043258  1216115
chr7  262624689  81219469  47052213  47076531  81079019  6197457  1127925
chr8  308469712  93087343  56578139  56466687  93245129  9092414  1427329
chrX   60718501  16286269  11251258  11264518  16304629  5611827   371051
chrUn 174229318  46556476  32785341  32891615  46354738 15641148  1620662
total 3605614649 1088682895 662229098 662181841 1088549386 103971429 16890329

    #	after verifying they are the same, remove the old unmasked sequence
for C in 1 2 3 4 5 6 7 8 X Un
do
    rm ${C}/chr${C}.fa
    mv ${C}/chr${C}_masked.fa ${C}/chr${C}.fa
done

    #	Create new 2bit file with this masked sequence
    ls -og *.2bit
    #	-rw-rw-r--  1 901986358 Feb  8 09:29 monDom4.2bit
    #	browser will not function while this 2bit file does not exist
    rm monDom4.2bit
    faToTwoBit ?/chr?.fa Un/chrUn.fa monDom4.2bit
    #	the file becomes larger:
    ls -og *.2bit
    #	-rw-rw-r--  1 950060406 Feb 10 13:53 monDom4.2bit

    #	and load the rmsk tables
    ssh hgwdev
    cd /cluster/data/monDom4
    time $HOME/bin/i386/hgLoadOut -verbose=2 monDom4 \
	?/chr?.fa.out Un/chrUn.fa.out
#       hgLoadOut: connected to database: monDom4
# Processing 1/chr1.fa.out
# Strange perc. field -460.2 line 561938 of 1/chr1.fa.out
# Strange perc. field -395.6 line 561938 of 1/chr1.fa.out
# Loading up table chr1_rmsk
# Processing 2/chr2.fa.out
# Strange perc. field -160.2 line 234960 of 2/chr2.fa.out
# Strange perc. field -238.7 line 234960 of 2/chr2.fa.out
# bad rep range [1723, 73] line 451681 of 2/chr2.fa.out 
# Strange perc. field -6.0 line 1104797 of 2/chr2.fa.out
# Loading up table chr2_rmsk
# Processing 3/chr3.fa.out
# bad rep range [2344, 191] line 648571 of 3/chr3.fa.out 
# Loading up table chr3_rmsk
# Processing 4/chr4.fa.out
# Strange perc. field -7.0 line 7733 of 4/chr4.fa.out
# Loading up table chr4_rmsk
# Processing 5/chr5.fa.out
# Strange perc. field -6.9 line 34415 of 5/chr5.fa.out
# Strange perc. field -1.1 line 44716 of 5/chr5.fa.out
# Strange perc. field -1.1 line 44720 of 5/chr5.fa.out
# Strange perc. field -1.1 line 44722 of 5/chr5.fa.out
# Strange perc. field -1.1 line 44724 of 5/chr5.fa.out
# Strange perc. field -38.3 line 146729 of 5/chr5.fa.out
# Strange perc. field -38.3 line 146732 of 5/chr5.fa.out
# Strange perc. field -38.3 line 146734 of 5/chr5.fa.out
# Strange perc. field -38.3 line 146737 of 5/chr5.fa.out
# Strange perc. field -42.5 line 239189 of 5/chr5.fa.out
# Loading up table chr5_rmsk
# Processing 6/chr6.fa.out
# Strange perc. field -0.2 line 87360 of 6/chr6.fa.out
# Strange perc. field -0.1 line 87360 of 6/chr6.fa.out
# Strange perc. field -3960.1 line 507698 of 6/chr6.fa.out
# Strange perc. field -1856.3 line 507698 of 6/chr6.fa.out
# Loading up table chr6_rmsk
# Processing 7/chr7.fa.out
# Loading up table chr7_rmsk
# Processing 8/chr8.fa.out
# Loading up table chr8_rmsk
# Processing X/chrX.fa.out
# Loading up table chrX_rmsk
# Processing Un/chrUn.fa.out
# Strange perc. field -27.6 line 129404 of Un/chrUn.fa.out
# Strange perc. field -1.8 line 129404 of Un/chrUn.fa.out
# Strange perc. field -27.6 line 129407 of Un/chrUn.fa.out
# Strange perc. field -1.8 line 129407 of Un/chrUn.fa.out
# Strange perc. field -27.6 line 129409 of Un/chrUn.fa.out
# Strange perc. field -1.8 line 129409 of Un/chrUn.fa.out
# Strange perc. field -27.6 line 129411 of Un/chrUn.fa.out
# Strange perc. field -1.8 line 129411 of Un/chrUn.fa.out
# Strange perc. field -27.6 line 129413 of Un/chrUn.fa.out
# Strange perc. field -1.8 line 129413 of Un/chrUn.fa.out
# Loading up table chrUn_rmsk
# note: 2 records dropped due to repStart > repEnd

    #	Note the interesting coincidence of the exact same % for monDom4
    #	and monDom2
    time featureBits monDom4 rmsk
    #	1814099592 bases of 3501643220 (51.807%) in intersection
    #	real    3m57.193s
    #	time featureBits -countGaps monDom4 rmsk
    #	1814099592 bases of 3605614649 (50.313%) in intersection

    time featureBits monDom2 rmsk
    #	1811406819 bases of 3496445653 (51.807%) in intersection
    #	real    5m31.388s
    time featureBits monDom1 rmsk
    #	1775646140 bases of 3492108230 (50.847%) in intersection

#######################################################################
# GC5BASE (DONE - 2006-02-08 - Hiram)
    ssh kkstore01
    mkdir /cluster/data/monDom4/bed/gc5Base
    cd /cluster/data/monDom4/bed/gc5Base
    time hgGcPercent -verbose=2 -wigOut -doGaps -file=stdout -win=5 monDom4 \
        /cluster/data/monDom4 | wigEncode stdin gc5Base.wig gc5Base.wib

    ssh hgwdev
    cd /cluster/data/monDom4/bed/gc5Base
    mkdir /gbdb/monDom4/wib
    ln -s `pwd`/gc5Base.wib /gbdb/monDom4/wib
    hgLoadWiggle -verbose=2 monDom4 gc5Base gc5Base.wig

#######################################################################
# LOAD GAP & GOLD TABLES FROM AGP (DONE - 2006-02-10 - Hiram)
    ssh hgwdev
    cd /cluster/data/monDom4
    hgGoldGapGl -noGl monDom4 broad.mit.edu/Monodelphis4.0.agp
    #   Verify sanity of indexes
    hgsql -e "show index from gold;" monDom4
    hgsql -e "show index from gap;" monDom4
    #   Look at the Cardinality column, it should have some healthy
    #   numbers for all indices
    # For some reason, the indices do not get built correctly --
    # "show index from gap/gold" shows NULL cardinalities for chrom.  
    # Rebuild indices with "analyze table".
    hgsql -e "analyze table gold; analyze table gap;" monDom4

#######################################################################
#	LINEAGE SPECIFIC REPEATS (DONE - 2006-02-10 - Hiram)
#	Let's see if DateRepeats can do anything with this Opossum
#	business.  This was tried, but it doesn't make anything special
#	for any of the organisms and a -comp of opossum isn't allowed at
#	this time so we couldn't do this on the other organisms either to 
#	obtain any specific repeats there.
    ssh kki
    mkdir /cluster/data/monDom4/linSpecRep
    cd /cluster/data/monDom4/linSpecRep
    
    for C in 1 2 3 4 5 6 7 8 X Un
    do
	ln -s ../${C}/chr${C}.fa.out ./chr${C}.rmsk.out
    done

    #	This takes a long time to run as a script, to speed it up it has
    #	been converted to a mini cluster run
    cat << '_EOF_' > template
#LOOP
/cluster/bluearc/RepeatMasker060120/DateRepeats $(path1) -query opossum -comp human -comp mouse -comp rat -comp dog -comp dog
#ENDLOOP
'_EOF_'

    ls *.out > out.list

    gensub2 out.list single template jobList

    ssh kki
    cd /cluster/bluearc/scratch/hg/gs.18/build35/rmsk.spec
    ./runArian.sh > jobList
    para create jobList
    para try, ... check ... push ... etc ...
    #	To check if any difference is noted in the various comparison
    #	organisms:
    awk '{if(NF>10){a=NF-4;b=NF-3;c=NF-2;d=NF-1; print $a,$b,$c,$d,$NF}}' \
	chr2.rmsk.out_homo-sapiens_mus-musculus_rattus_canis-familiaris_canis-familiaris \
	    | sort | uniq -c
    #	187651 - - - - -
    #	113100 0 0 0 0 0
    #	210909 ? ? ? ? ?
    #	602832 X X X X X
    #	always the same mark for each comp organism

#######################################################################
#	Distribute files to filesystems for kluster runs
    ssh kkr1u00
    mkdir /iscratch/i/monDom4
    cd /iscratch/i/monDom4
    cp -p /cluster/data/monDom4/monDom4.2bit .
    cp -p /cluster/data/monDom4/chrom.sizes .
for R in 2 3 4 5 6 7 8
do
    rsync -a --progress /iscratch/i/monDom4/ kkr${R}u00:/iscratch/i/monDom4/
done

    ssh pk
    mkdir /san/sanvol1/scratch/monDom4
    cd /san/sanvol1/scratch/monDom4
    cp -p /cluster/data/monDom4/monDom4.2bit .
    cp -p /cluster/data/monDom4/chrom.sizes .

#######################################################################
#  HUMAN BLASTZ - this time with masked sequence
#	(DONE - 2006-02-10 - 2006-02-15 - Hiram)
    ssh pk
    mkdir /cluster/data/monDom4/bed/blastzHg18.2006-02-10
    cd /cluster/data/monDom4/bed/blastzHg18.2006-02-10

    cat << '_EOF_' > DEF
# opossum vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# settings for more distant organism alignments
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Opossum monDom4
SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Human (hg18)
SEQ2_DIR=/scratch/hg/hg18/nib
SEQ2_LEN=/scratch/hg/hg18/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/monDom4/bed/blastzHg18.2006-02-10
TMPDIR=/scratch/tmp
'_EOF_'
    #	happy emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-stop=net `pwd`/DEF > blastz.out 2>&1 &
    #	running 2006-02-10

#######################################################################
#	Additional RepeatMasking - this sequence needs to be
#	additionally masked with Damian Keefe's libraries
#		(DONE - 2006-02-11 - Hiram)
    ssh kkr1u00
    mkdir /cluster/data/monDom4/RMExtra
    cd /cluster/data/monDom4/RMExtra
    wget --timestamping --force-directories --directory-prefix=. \
        --dont-remove-listing --recursive --level=2 --no-parent \
        --no-host-directories --cut-dirs=6 \
ftp://ftp.ebi.ac.uk/pub/databases/ensembl/encode/repeat_libraries/monodelphis/*
    #	picks up:
    #	-rw-r--r--  1  679203 May  9  2005 supplemental.lib
    #	-rw-r--r--  1  737274 May  9  2005 ab_initio.lib
    #	-rw-r--r--  1     452 May  9  2005 README
    cp -p supplemental.lib /iscratch/i/monDom4
    cd /iscratch/i/monDom4
    for R in 2 3 4 5 6 7 8
    do
	rsync -a --progress /iscratch/i/monDom4/ kkr${R}u00:/iscratch/i/monDom4/
    done

    ssh kk
    cd /cluster/data/monDom4/RMExtra
    cat << '_EOF_' > RMExtra.csh
#!/bin/csh -fe
/bin/mkdir -p /scratch/tmp/monDom4/$1/$2
/bin/mkdir -p ../RMOutExtra/$1
/bin/cp ../split500K/$1/$2.fa /scratch/tmp/monDom4/$1/$2/$2.fa
pushd /scratch/tmp/monDom4/$1/$2
/cluster/bluearc/RepeatMasker060120/RepeatMasker -s \
        -lib /iscratch/i/monDom4/supplemental.lib $2.fa
popd
/bin/cp -p /scratch/tmp/monDom4/$1/$2/$2.fa.out $3
/bin/rm -fr /scratch/tmp/monDom4/$1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4/$1/$2
/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4/$1
'_EOF_'
    #	happy emacs
    chmod +x RMExtra.csh

    cat << '_EOF_' > template
#LOOP
./RMExtra.csh $(dir1) $(root1) {check out line ../RMOutExtra/$(dir1)/$(root1).out}
#ENDLOOP
'_EOF_'
    #	happy emacs

    mkdir ../RMOutExtra
    gensub2 ../RMRun/split.list single template jobList
    para create jobList
    para try ... check ... push ... etc...
    para time
# Completed: 7170 of 7170 jobs
# CPU time in finished jobs:    8540127s  142335.45m  2372.26h   98.84d  0.271 y
# IO & Wait Time:                103372s    1722.87m    28.71h    1.20d  0.003 y
# Average job time:                1206s      20.09m     0.33h    0.01d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            2326s      38.77m     0.65h    0.03d
# Submission to last job:         22083s     368.05m     6.13h    0.26d
    cd /cluster/data/monDom4/RMOutExtra
    for C in 1 2 3 4 5 6 7 8 X Un
    do
	echo "chr${C} working ... "
	(cat ${C}/chr${C}_000.out ${C}/chr${C}_0000.out 2> /dev/null; \
	    ls ${C}/chr${C}_*.out | egrep -v "_000.out|_0000.out" \
		| xargs tail --quiet --lines=+4) | \
	    liftUp ./chr${C}.fa.out ../lifts/lift500K.lft error stdin
	echo "chr${C} done"
    done

    #	Add the two results together
    cd /cluster/data/monDom4/RMOut
    cp -p /cluster/data/monDom2/scripts/uniqueRMOut.csh ../scripts
    for C in 1 2 3 4 5 6 7 8 X Un
do
    echo "${C} working ..."
    cp -p ../${C}/chr${C}.fa.out ./chr${C}.tmp.out
    tail +4 ../RMOutExtra/chr${C}.fa.out >> ./chr${C}.tmp.out
    ../scripts/uniqueRMOut.csh ./chr${C}.tmp.out > chr${C}.fa.out
    rm -f ./chr${C}.tmp.out &
    echo "done"
done
    #	and then add this mask to the existing masked sequence
    for C in 1 2 3 4 5 6 7 8 X Un
do
    echo "chr${C} working ..."
    rm -f ./chr${C}.added.fa
    maskOutFa -softAdd ../${C}/chr${C}.fa ./chr${C}.fa.out ./chr${C}.added.fa
    echo "done"
done

    #	verify they are not broken compared to what was previous
    #	note the additional masked sequence in the lower case counts
    ssh kkstore01
    cd /cluster/data/monDom4/RMOut
    faSize chr?.added.fa chrUn.added.fa
    # 3605614649 bases (103971429 N's 3501643220 real 1551686450 upper
    # 1949956770 lower) in 10 sequences in 10 files
    faSize ../?/chr?.fa ../Un/chrUn.fa
    # 3605614649 bases (103971429 N's 3501643220 real 1687813668 upper
    # 1813829552 lower) in 10 sequences in 10 files

    #	If that is OK, replace the current sequence
    for C in 1 2 3 4 5 6 7 8 X Un
    do
	rm ../${C}/chr${C}.fa
	mv chr${C}.added.fa ../${C}/chr${C}.fa
    done

    #	And rebuild the 2bit file:
    faToTwoBit ?/chr?.fa Un/chrUn.fa monDom4RMExtra.2bit
    #	and switch it out from under the browser
    rm monDom4.2bit; mv monDom4RMExtra.2bit monDom4.2bit
    #	distribute to /iscratch/i/monDom4 ...

############################################################################
#  BLATSERVERS ENTRY (DONE - 2006-02-16 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("monDom4", "blat17", "17786", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("monDom4", "blat17", "17787", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

#########################################################################
# CPGISLANDS (DONE - 2006-02-16 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/monDom4/bed/cpgIsland
    cd /cluster/data/monDom4/bed/cpgIsland

    # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
    cvs co hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    make
    #	gcc readseq.c cpg_lh.c -o cpglh.exe
    cd ../..
    ln -s hg3rdParty/cpgIslands/cpglh.exe .
    
    ssh kkstore01
    cd /cluster/data/monDom4
    mkdir tmp
    # cpglh.exe requires hard-masked (N) .fa's.  
    time for CHR in ?/chr?.fa ??/chr??.fa
    do
	echo "maskOutFa ${CHR} hard ${CHR}.masked"
	maskOutFa ${CHR} hard ${CHR}.masked
    done > tmp/hardMask.out 2>&1
    #	about 3 minutes 20 seconds

    # There may be warnings about "bad character" for IUPAC ambiguous 
    # characters like R, S, etc.  Ignore the warnings.  
    cd /cluster/data/monDom4/bed/cpgIsland
    for F in ../../*/chr*.fa.masked
    do
	FA=${F/*\/}
	C=${FA/.fa.masked/}
	echo "./cpglh.exe ${FA} > ${C}.cpg"
	./cpglh.exe ${F} > ${C}.cpg
    done > cpglh.out 2>&1 &
    #	about 4 minutes, there were no warnings

    # Transform cpglh output to bed +
    cat << '_EOF_' > filter.awk
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
    #	happy emacs
    awk -f filter.awk chr*.cpg | sort -k1,1 -k2,2n > cpgIsland.bed

    ssh hgwdev
    cd /cluster/data/monDom4/bed/cpgIsland
    hgLoadBed -strict monDom4 cpgIslandExt -tab -noBin \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
    #	Reading cpgIsland.bed
    #	Loaded 22409 elements of size 10
    featureBits monDom4 cpgIslandExt
    #	14526908 bases of 3501643220 (0.415%) in intersection

#########################################################################
# ANDY LAW CPGISSLANDS (DONE - 2006-02-16 - Hiram)
    # See notes in makeGalGal2.doc and makeCanFam2.doc
    ssh kkstore01
    mkdir /cluster/data/monDom4/bed/cpgIslandGgfAndy
    cd /cluster/data/monDom4/bed/cpgIslandGgfAndy

    #	Build the preProcGgfAndy program in
    #	kent/src/oneShot/preProcGgfAndy into your ~/bin/$MACHTYPE

    # Use masked sequence since this is a mammal...
    for F in ../../*/chr*.fa.masked
    do
	FA=${F/*\/}
	C=${FA/.fa.masked/}
	echo preproc and run on masked "${C} ${F}" 1>/dev/stderr
	~/bin/$MACHTYPE/preProcGgfAndy ${F} \
	| /cluster/home/angie/ggf-andy-cpg-island.pl \
	| perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g1,$oE) = split("\t"); $s--;
                   $gc=$c+$g1;  $pCpG=(100.0 * 2 * $cpg / $n);
                   $pGc=(100.0 * $gc / $n);
                   $_="'${C}'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" .
                        "$pCpG\t$pGc\t$oE\n";'
    done | sort -k1,1 -k2,2n > cpgIslandGgfAndyMasked.bed
    #	about 3 minutes

    # load into database:
    ssh hgwdev
    cd /cluster/data/monDom4/bed/cpgIslandGgfAndy
    sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
      $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql
    hgLoadBed -strict monDom4 cpgIslandGgfAndyMasked -tab -noBin \
      -sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed
    #	Loaded 67504 elements of size 10
    featureBits monDom4 cpgIslandExt
    #	14526908 bases of 3501643220 (0.415%) in intersection
    featureBits monDom4 cpgIslandGgfAndyMasked
    #	41042229 bases of 3501643220 (1.172%) in intersection
    wc -l ../cpgIsland/cpgIsland.bed *bed
    #	22409 ../cpgIsland/cpgIsland.bed
    #	67504 cpgIslandGgfAndyMasked.bed

#######################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2006-02-24 - Hiram)
    ssh kkstore01
    cd /cluster/data/monDom4
    #	use repMatch of 1228 as this genome is ~ %20 larger than human
    #	1024 + (1024 * 0.2) = 1228
    time blat monDom4.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1228
    #	Wrote 43992 overused 11-mers to 11.ooc
    #	real    3m24.793s

    #	copy to bluearc filesystem for kluster use
    cp -p 11.ooc /cluster/bluearc/scratch/hg/monDom4

#########################################################################
# GENBANK auto update (DONE - 2006-02-24 - Hiram)
    # align with revised genbank process. drop xeno ESTs.
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvs update -d -P etc
    #edit etc/genbank.conf to add monDom4, it is a copy of monDom2 with changes:

# monDom4 (M. domestica)  8 chroms plus X and Un
monDom4.serverGenome = /cluster/data/monDom4/monDom4.2bit
monDom4.clusterGenome = /cluster/bluearc/scratch/hg/monDom4/monDom4.2bit
monDom4.ooc = /cluster/data/monDom4/11.ooc
monDom4.lift = no
monDom4.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
monDom4.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
monDom4.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
monDom4.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
monDom4.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
monDom4.downloadDir = monDom4
monDom4.refseq.mrna.xeno.load  = yes
monDom4.refseq.mrna.xeno.loadDesc = yes
monDom4.mgcTables.default = full
monDom4.mgcTables.mgc = all

    #	check that into CVS, then
    # update /cluster/data/genbank/
    make etc-update

    ssh kkstore02
    cd /cluster/data/genbank
    nice -n 19 bin/gbAlignStep -initial monDom4 &
    #	var/build/logs/2006.02.24-22:38:50.monDom4.initalign.log

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    nice -n 19 ./bin/gbDbLoadStep -drop -initialLoad  monDom4 &
    #	var/dbload/hgwdev/logs/2006.02.26-09:36:15.dbload.log
XXXX - running 2006-02-26 09:36

##########################################################################
#  BLASTZ SELF  (DONE - 2006-03-02 - 2006-04-19 - Hiram)

    ssh kk
    mkdir /cluster/data/monDom4/bed/blastzSelf.2006-03-02
    cd /cluster/data/monDom4/bed
    ln -s blastzSelf.2006-03-02 blastz.monDom4
    cd blastzSelf.2006-03-02
    
    cat << '_EOF_' > DEF
# monDom4 vs monDom4
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

BLASTZ=blastz.v7

BLASTZ_M=100

# TARGET: Opossum monDom4
SEQ1_DIR=/iscratch/i/monDom4/monDom4.2bit
SEQ1_LEN=/iscratch/i/monDom4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: MonDom2
SEQ2_DIR=/iscratch/i/monDom4/monDom4.2bit
SEQ2_LEN=/iscratch/i/monDom4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/monDom4/bed/blastzSelf.2006-03-02
TMPDIR=/scratch/tmp
'_EOF_'
    #	happy emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=10000 -chainLinearGap=medium \
	`pwd`/DEF > blastz.out 2>&1 &
    #	This was a difficult run, needed to do the last job on Kolossus:
# Completed: 133955 of 133956 jobs
# CPU time in finished jobs:   78602533s 1310042.22m 21834.04h  909.75d  2.492 y
# IO & Wait Time:              47419875s  790331.25m 13172.19h  548.84d  1.504 y
# Average job time:                 941s      15.68m     0.26h    0.01d
# Longest finished job:          111202s    1853.37m    30.89h    1.29d
# Submission to last job:        521430s    8690.50m   144.84h    6.04d

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=10000 -chainLinearGap=medium \
	-continue=cat `pwd`/DEF > cat.out 2>&1 &
    #	broke during chaining, the processes are too large.  Running
    #	them individually on hgwdev64 and kolossus, they take many days
    #	each, perhaps a week, still trying to get them done after two
    #	weeks of messing with them.
    #	last one took:
    #	real    7343m55.064s
    #	user    2270m9.968s
    #	sys     5072m16.807s
    #	that's 5 days and 2 hours, about 5 weeks for all:
    #	-rw-rw-r--  1  138817724 Mar 10 17:48 monDom4.2bit:chrX:.chain
    #	-rw-rw-r--  1  500083934 Mar 11 15:06 monDom4.2bit:chr7:.chain
    #	-rw-rw-r--  1  681900033 Mar 11 20:27 monDom4.2bit:chr5:.chain
    #	-rw-rw-r--  1  495478058 Mar 11 20:53 monDom4.2bit:chr6:.chain
    #	-rw-rw-r--  1  948724278 Mar 12 12:55 monDom4.2bit:chrUn:.chain
    #	-rw-rw-r--  1  668565678 Mar 12 18:03 monDom4.2bit:chr8:.chain
    #	-rw-rw-r--  1 1214431534 Mar 23 21:39 monDom4.2bit:chr2:.chain
    #	-rw-rw-r--  1 1755465687 Apr  9 19:31 monDom4.2bit:chr4:.chain
    #	-rw-rw-r--  1 1739564535 Apr 10 15:55 monDom4.2bit:chr1:.chain
    #	-rw-rw-r--  1 2041409562 Apr 16 13:58 monDom4.2bit:chr3:.chain

    #	loading the tables on kolossus
    ssh kolossus
    cd /cluster/data/monDom4/bed/blastzSelf.2006-03-02/axtChai
    cat << '_EOF_' > loadUp.csh
#!/bin/csh -fe
cd /cluster/data/monDom4/bed/blastzSelf.2006-03-02/axtChain/chain
foreach c (`awk '{print $1;}' /cluster/bluearc/scratch/hg/monDom4/chrom.sizes`)
    set f = $c.chain
    if (! -e $f) then
      echo no chains for $c
      set f = /dev/null
    endif
    hgLoadChain monDom4 ${c}_chainSelf $f
end
'_EOF_'
    #	<< emacs
    chmod +x loadUp.csh
    time ./loadUp.csh
    #	real    211m16.852s
    #	request push of *chainSelf* tables from kolossus to hgwdev
    #	almost 32 Gb of data.

    ssh kolossus
    cd /cluster/data/monDom4/bed/blastzSelf.2006-03-02
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=10000 -chainLinearGap=medium \
	-continue=net `pwd`/DEF > net.out 2>&1 &

############################################################################
##  BLASTZ swap from mm8 alignments (DONE - 2006-02-23 - Hiram)
    ssh pk
    cd /cluster/data/mm8/bed/blastzMonDom4.2006-02-23
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
        `pwd`/DEF > swap.out 2>&1 &

    time nice -n +19 featureBits monDom4 chainMm8Link
    #   210933035 bases of 3501643220 (6.024%) in intersection

############################################################################
##  BLASTZ CHICKEN galGal2 (DONE - 2006-03-06 - 2006-03-08 - Hiram)

    ssh pk
    mkdir /cluster/data/monDom4/bed/blastzGalGal2.2006-03-06
    cd /cluster/data/monDom4/bed
    ln -s blastzGalGal2.2006-03-06 blastz.galGal2
    cd blastzGalGal2.2006-03-06

    cat << '_EOF_' > DEF
# Opossum vs Chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Opossum monDom4
SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Chicken galGal2 - single chunk big enough for whole chroms at once
SEQ2_DIR=/scratch/hg/galGal2/nib
SEQ2_LEN=/scratch/hg/galGal2/chrom.sizes
SEQ2_CHUNK=200000000
SEQ2_LAP=0

BASE=/cluster/data/monDom4/bed/blastzGalGal2.2006-03-06
TMPDIR=/scratch/tmp
'_EOF_'
  # << happy emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	`pwd`/DEF > blastz.out 2>&1 &
    #	real    1399m47.238s

    mkdir /cluster/data/galGal2/bed/blastz.monDom4.swap
    cd /cluster/data/galGal2/bed
    ln -s blastz.monDom4.swap blastz.monDom4
    cd blastz.monDom4.swap

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap /cluster/data/monDom4/bed/blastzGalGal2.2006-03-06/DEF \
	> swap.out 2>&1 &


    featureBits monDom4 chainGalGal2Link > fb.monDom4.chainGalGal2Link 2>&1
    cat fb.monDom4.chainGalGal2Link
    #	106807090 bases of 3501643220 (3.050%) in intersection
    featureBits galGal2 chainMonDom4Link > fb.galGal2.chainMonDom4Link 2>&1
    cat fb.galGal2.chainMonDom4Link
    #	95852676 bases of 1054197620 (9.092%) in intersection

############################################################################
##  BLASTZ ZEBRAFISH danRer3 (DONE - 2006-03-06 - 2003-03-08 - Hiram)
    ssh pk
    mkdir /cluster/data/monDom4/bed/blastzDanRer3.2006-03-06
    cd /cluster/data/monDom4/bed
    ln -s blastzDanRer3.2006-03-06 blastz.danRer3
    cd blastzDanRer3.2006-03-06

    #	doing only the chroms on zebrafish, leaving out the chrUn and chrNA

    cat << '_EOF_' > DEF
# Opossum vs Zebrafish
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Opossum monDom4
SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Zebrafish (danRer3)
#  large enough chunk to do complete chroms at once
SEQ2_DIR=/san/sanvol1/scratch/danRer3/chromNib
SEQ2_LEN=/san/sanvol1/scratch/danRer3/chromNib.sizes
SEQ2_CHUNK=100000000
SEQ2_LAP=0

BASE=/cluster/data/monDom4/bed/blastzDanRer3.2006-03-06
TMPDIR=/scratch/tmp
'_EOF_'
  # << happy emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	`pwd`/DEF > blastz.out 2>&1 &
    #	common failure for unknown reason:
    #	startStep: 0, at step 5 net to stopStep 9
    #	netChains: looks like previous stage was not successful
    #		(can't find [monDom4.danRer3.]all.chain[.gz]).
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=net `pwd`/DEF > net.out 2>&1 &

    mkdir /cluster/data/danRer3/bed/blastz.monDom4.swap
    cd /cluster/data/danRer3/bed
    ln -s blastz.monDom4.swap blastz.monDom4
    cd blastz.monDom4.swap

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap /cluster/data/monDom4/bed/blastzDanRer3.2006-03-06/DEF \
	> swap.out 2>&1 &

    featureBits monDom4 chainDanRer3Link > fb.monDom4.chainDanRer3Link 2>&1
    cat fb.monDom4.chainDanRer3Link
    #	73187496 bases of 3501643220 (2.090%) in intersection
    featureBits danRer3 chainMonDom4Link > fb.danRer3.chainMonDom4Link 2>&1
    cat fb.danRer3.chainMonDom4Link
    #	65225617 bases of 1630323462 (4.001%) in intersection

##########################################################################
# MAKE Human Proteins track (WORKING - 2006-03-24 - Hiram)
    #	we need the split500K sequences to do this work, set them up on
    #	the Iservers
    ssh kkstore01
    cd /cluster/data/monDom4/split500K
    rsync -a --progress `pwd`/ kkr1u00:/iscratch/i/monDom4/split500K/
    ssh kkr1u00
    cd /iscratch/i/monDom4/split500K
    #	create individual blast db's for each sequence
    for C in 1 2 3 4 5 6 7 8 Un X
    do
	echo ${C} working ...
	cd ${C}
	for i in *.fa
	do
	    /cluster/bluearc/blast229/formatdb -p F -i $i
	done
	cd ..
	echo ${C} done
    done
    find /iscratch/i/monDom4/split500K -name "*.nsq" \
	| sed "s/\.nsq//" > target.lst        

    #	now sync everything to the other Iserver nodes
    for R in 2 3 4 5 6 7 8
    do
	rsync -a --progress `pwd`/ kkr${R}u00:/iscratch/i/monDom4/split500K/
    done

    ssh kkstore01
    mkdir -p /cluster/data/monDom4/bed/tblastn.hg18KG
    cd /cluster/data/monDom4/bed/tblastn.hg18KG

    mkdir kgfa
    # calculate number of protein sequence to give a job count of about
    #	500,000
    echo `cat /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | wc -l` \
	`cat /iscratch/i/monDom4/split500K/target.lst | wc -l` \
	| awk '{printf "%d/(500000/%d) = %d\n", $1, $2, $1/(500000/$2)}'
    #	36727/(500000/7170) = 526
    #	split the proteins by that number

    split -l 526 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg

    cd kgfa
    for i in kg??; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    #	Want output to go to bluearc to preserve sanity of file server
    rm -rf  /cluster/bluearc/monDom4/bed/tblastn.hg18KG/blastOut
    mkdir -p /cluster/bluearc/monDom4/bed/tblastn.hg18KG/blastOut
    ln -s  /cluster/bluearc/monDom4/bed/tblastn.hg18KG/blastOut .
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done

    ssh kk
    cd /cluster/data/monDom4/bed/tblastn.hg18KG
    cat /iscratch/i/monDom4/split500K/*.lft > subChr.lft

    cat << '_EOF_' > template
#LOOP
blastSome $(path1) $(path2) {check out exists blastOut/$(root2)/q.$(root1).psl}
#ENDLOOP
'_EOF_'
    #	<<	happy emacs

    #### !NOTE: Next time run these jobs with a better separation
    #	of I/O spaces, they seem to hangup bluearc while they work.
    #	Plus, they are using /tmp instead of /scratch/tmp/
    #	And they are leaving lots of garbage, the $f.3 file.
    #	Have added $f.3 to the listing below to remove it


    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
        liftUp -nosort -type=".psl" -pslQ -nohead $f.3 /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2
	liftUp -nosort -type=".psl" -nohead $3.tmp /cluster/data/monDom4/bed/tblastn.hg18KG/subChr.lft carry $f.3
        mv $3.tmp $3
        rm -f $f.1 $f.2 $f.3
        exit 0
    fi
fi
rm -f $f.1 $f.2 $f.3 $3.tmp $f.8
exit 1
'_EOF_'
    #	<<	happy emacs
    chmod +x blastSome

    cp -p /iscratch/i/monDom4/split500K/target.lst .
    gensub2 target.lst kg.lst template jobList

    #	You want to try 10 jobs and see if the following chain
    #	pipeline will work in a reasonable time.  If it doesn't the
    #	jobs are too big.  There were some hippos in the simpleChain
    #	step.

    para create jobList
    para try

    para push

# Completed: 504218 of 504218 jobs
# CPU time in finished jobs:   28863259s  481054.31m  8018.57h  334.07d  0.915 y
# IO & Wait Time:               4255045s   70918.42m  1181.96h   49.25d  0.135 y
# Average job time:                  66s       1.09m     0.02h    0.00d
# Longest finished job:             443s       7.38m     0.12h    0.01d
# Submission to last job:        166648s    2777.47m    46.29h    1.93d

    #	This simpleChain step has jobs that are too large.  They could
    #	be broken up even more as was done for the hippos that remained,
    #	see below.
    ssh kk
    cd /cluster/data/monDom4/bed/tblastn.hg18KG
    mkdir chainEm
    cd /cluster/data/monDom4/bed/tblastn.hg18KG/blastOut
    find . -name "*.psl" \
	> /cluster/data/monDom4/bed/tblastn.hg18KG/chainEm/chain.lst
    cd chainEm
    mkdir psl

    cat << '_EOF_' > template
#LOOP
./chainSome.csh $(root1) $(file2) {check out line psl/c.$(file2).$(root1).psl}
#ENDLOOP
'_EOF_'
    #	<<	happy emacs

    #	you will have to build this simpleChain binary in
    #	hg/mouseStuff/simpleChain/
    #	it will not run compiled as an x86_64 binary, only i386
    cat << '_EOF_' > chainSome.csh
#!/bin/csh -fe
cat ../blastOut/${1}/q.${2}_*.psl | \
    $HOME/bin/i386/simpleChain \
        -prot -outPsl -maxGap=250000 stdin psl/c.${2}.${1}.psl
'_EOF_'
    #	<<	happy emacs
    chmod +x chainSome.csh

    ls -1dS ../blastOut/kg?? > chain.dirs

    gensub2 chain.dirs ../../../chrom.lst template jobList 
    para create jobList
    para push
# Completed: 574 of 699 jobs
# CPU time in finished jobs:   17708635s  295143.92m  4919.07h  204.96d  0.562 y
# IO & Wait Time:               1589482s   26491.37m   441.52h   18.40d  0.050 y
# Average job time:               33620s     560.34m     9.34h    0.39d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:          222754s    3712.57m    61.88h    2.58d
# Submission to last job:        673020s   11217.00m   186.95h    7.79d

    #	Some of these ended up as hippos after a few days.
    #	Rebuild a new batch for just the hippos
    ssh kk
    cd /cluster/store9/monDom4/bed/tblastn.hg18KG/chainEm
    mkdir runHippos
    mkdir runHippos/psl
    #	from the running hippos, extract the commands:
    para running | grep command: > splitHippos.sh
    #	edit splitHippos.sh to make those lines read:
    #	./mkHippoList.csh kgaa chr1
    #	where mkHippoList.csh is:
    #	#!/bin/csh -fe
    #	ls ../blastOut/${1}/q.${2}_*.psl
    #	run that:
    ./splitHippos.sh > hippo.list
    #	split by chrom
    grep chr1_ hippo.list > chr1.list
    grep chr2_ hippo.list > chr2.list
    grep chr3_ hippo.list > chr3.list
    grep chr4_ hippo.list > chr4.list
    #	these were the only chroms in the list, then

    #	The sequence below does not produce the correct result.
    #	Instead, break up all protein results for each chrom to
    #	individual files.
    ssh hgwdev
    cd /cluster/store9/monDom4/bed/tblastn.hg18KG/chainEm
    #	Do these one at a time since they create huge amounts of data in
    #	/scratch/tmp  - after each one has been sent to kkr1u00, the
    #	data here on hgwdev can be deleted
    cat chr1.list | xargs cat > /scratch/tmp/chr1.psl
    cd /scratch/tmp
    cat << '_EOF_' > splitProteins.pl
#!/usr/bin/env perl

use strict;
use warnings;

sub usage() {
    print "usage: sort -k10 chrN.psl | ./splitProteins.pl <chrN>\n";
    print "  where chrN is : chr1, chr2, chr3, etc...\n";
    print "  one at a time please.\n";
    exit 255;
}
my $argc = scalar(@ARGV);

if ($argc != 1) { usage; }

my $chr = shift;

print "mkdir -p $chr", "\n";
print `mkdir -p $chr`, "\n";

my $curProtein="";
my $linesRead = 0;

while (my $line=<STDIN>) {
    ++$linesRead;
my ($col1, $col2, $col3, $col4, $col5, $col6, $col7, $col8, $col9, $col10, $rest) = split('\t',$line);
    if ($curProtein ne $col10) {
        if (length($curProtein) > 0) {
        close(FH);
        }
        open(FH,">$chr/$col10.psl") or die "can not open $chr/$col10.psl";
        $curProtein = $col10;
    }
    print FH $line;
    if (0 == ($linesRead % 1000)) {print ".";}
    if (0 == ($linesRead % 70000)) {print " $linesRead \n";}
}
close(FH);
'_EOF_'
    #	<< happy emacs
    chmod +x ./splitProteins.pl

    sort -k10 chr1.psl | ./splitProteins.pl chr1
    rsync -a --progress ./chr1/ kkr1u00:/iscratch/i/monDom4/simpleChain/chr1/
    
    #	After the four chroms are there, sync to all I servers
    ssh kkr1u00
    cd /iscratch/i/monDom4/simpleChain
    for R in 2 3 4 5 6 7 8
do
    rsync -a --progress --stats `pwd`/ \
	kkr${R}u00:/iscratch/i/monDom4/simpleChain/
done
    #
    #	while we are here, make up the jobList
    cat << '_EOF_' > mkJobList
#!/bin/sh

for C in 1 2 3 4
do
    M=0
    N="00"
    find ./chr${C} -type f | while read F
    do
        BN=`basename ${F}`
        echo "./oneChained.csh chr${C} ${BN} ${N} (check line+ psl/chr${C}/${N}/${BN})"
        M=`echo $M | awk '{printf "%d", $1+1}'`
        if [ "${M}" -gt 999 ]; then
            N=`echo ${N} | awk '{printf "%02d", $1+1}'`
            M=0
        fi
    done
done
'_EOF_'
    #	<< happy emacs
    chmod +x mkJobList
    ./mkJobList > jobList
    mkdir /cluster/data/monDom4/bed/tblastn.hg18KG/hippoRun
    cp jobList /cluster/data/monDom4/bed/tblastn.hg18KG/hippoRun

    ssh kk
    cd /cluster/data/monDom4/bed/tblastn.hg18KG/hippoRun
    mkdir psl
    cat << '_EOF_' > oneChained.csh
#!/bin/csh -fe
mkdir -p psl/${1}/${3}
cat /iscratch/i/monDom4/simpleChain/${1}/${2} | \
    $HOME/bin/i386/simpleChain \
        -prot -outPsl -maxGap=250000 stdin psl/${1}/${3}/${2}
'_EOF_'
    #	<< happy emacs

    #	XXXX this sequence did not work right, it breaks up individual
    #	XXXX protein results
    #	these were the only chroms in the list, then
    mkdir chr1 chr2 chr3 chr4
    cd chr1
    split -a 3 -l 100 ../chr1.list chr1
    cd ../chr2
    split -a 3 -l 100 ../chr2.list chr2
    cd ../chr3
    split -a 3 -l 100 ../chr3.list chr3
    cd ../chr4
    split -a 3 -l 100 ../chr4.list chr4
    cd ../runHippos
    cat << '_EOF_' > mkJobList.sh
#!/bin/sh

for C in 1 2 3 4
do
    for F in `(cd ../chr${C}; ls chr${C}*)`
    do
    echo \
"./chainHippos.csh ${F} chr${C} {check out line psl/c.chr${C}.${F}.psl}"
    done
done
'_EOF_'
    #	<< happy emacs
    chmod +x ./mkJobList.sh
    ./mkJobList.sh > jobList


    cat << '_EOF_' > chainHippos.csh
#!/bin/csh -fe
sed -e "s/^/..\//" ../${2}/${1} | xargs cat | \
    $HOME/bin/i386/simpleChain \
        -prot -outPsl -maxGap=250000 stdin psl/c.${2}.${1}.psl
'_EOF_'
    #	<< happy emacs
    chmod +x chainHippos.sh

    para create jobList
    para try ... check ... etc ....

    ######  Continuing with the concatenation of all the results
    ssh kkstore04
    #	running out of disk space on store9, create a parallel hierarchy
    #	on a different disk still on this server:
    mkdir -p /cluster/store8/monDom4/bed/tblastn.hg18KG/sortingResults
    cd /cluster/data/monDom4/bed/tblastn.hg18KG
    ln -s /cluster/store8/monDom4/bed/tblastn.hg18KG/sortingResults \
    	sortingResults
    #	first set of results was in blastOut
    cd blastOut
    time for i in kg??
    do 
	find ./$i -name "q.*.psl" | xargs cat \
		| awk '($13 - $12)/$11 > 0.6 {print}' \
    > /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/c60.$i.psl
	sort -rn \
/cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/c60.$i.psl \
	    | pslUniq stdin \
/cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/u.$i.psl
	awk '(($1 / $11) ) > 0.60 {print}' \
/cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/c60.$i.psl \
	   > /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/m60.$i.psl
	echo $i
    done
    #	real    199m44.441s

    #	And then, adding the hippo results to those:
    cd ../hippoRun/psl
    time for C in 1 2 3 4
    do
	cd chr${C}
	for D in *
	do
	    find ./${D} -name "*.psl" | xargs cat \
		| awk '($13 - $12)/$11 > 0.6 {print}' \
> /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/c60.kg0${C}_${D}.psl
	    sort -rn \
/cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/c60.kg0${C}_${D}.psl \
	    | pslUniq stdin \
/cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/u.kg0${C}_${D}.psl
	awk '(($1 / $11) ) > 0.60 {print}' \
/cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/c60.kg0${C}_${D}.psl \
> /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/m60.kg0${C}_${D}.psl
	echo chr${C}/${D}
	done
	cd ..
    done
    #	real    14m49.721s


    cd ../../sortingResults
    sort -u -k 14,14 -k 16,16n -k 18,18n u.*.psl m60* \
	> /cluster/data/monDom4/bed/tblastn.hg18KG/blastHg18KG.psl
    cd ..
    wc -l blastHg18KG.psl
    # 63264 blastHg18KG.psl

    #	what is this for ?
    cat kgfa/*fa | grep ">" | wc
    # 309494  309494 4810327

    ssh hgwdev
    cd /cluster/data/monDom4/bed/tblastn.hg18KG
    hgLoadPsl monDom4 blastHg18KG.psl
    time nice -n +19 featureBits monDom4 blastHg18KG
    #	19424941 bases of 3501643220 (0.555%) in intersection

    # back to kksilo
    rm -rf blastOut

# End tblastn

############################################################################
# SWAP CHAINS/NET RN4 (DONE 3/28/06 angie)
    ssh kkstore01
    mkdir /cluster/data/monDom4/bed/blastz.rn4.swap
    cd /cluster/data/monDom4/bed/blastz.rn4.swap
    doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.monDom4/DEF \
      -workhorse kkr8u00 >& do.log & tail -f do.log
    ln -s blastz.rn4.swap /cluster/data/monDom4/bed/blastz.rn4
    nice featureBits monDom4 chainRn4Link 
    #	198094619 bases of 3501643220 (5.657%) in intersection


#######################################################################
#  CHIMP BLASTZ - (DONE - 2006-03-29 - 2006-04-10 - Hiram)
    ssh pk
    mkdir /cluster/data/monDom4/bed/blastzPanTro2.2006-03-29
    cd /cluster/data/monDom4/bed
    ln -s blastzPanTro2.2006-03-29 blastz.panTro2
    cd blastzPanTro2.2006-03-29

    cat << '_EOF_' > DEF
# opossum vs. chimp
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# settings for more distant organism alignments
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Opossum monDom4
SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Chimp PanTro2
SEQ2_DIR=/scratch/hg/panTro2/nib
SEQ2_LEN=/scratch/hg/panTro2/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/monDom4/bed/blastzPanTro2.2006-03-29
TMPDIR=/scratch/tmp
'_EOF_'
    # << emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	`pwd`/DEF > blastz.out 2>&1 &
    # XXXX	running 2006-03-29

    # 2 chroms crashed out-of-mem, so run manually on kolossus
    ssh kolossus
    cd /cluster/data/monDom4/bed/blastz.panTro2/axtChain/run
    (chain.csh monDom4.2bit:chr4: chain/monDom4.2bit:chr4:.chain; \
    chain.csh monDom4.2bit:chr3: chain/monDom4.2bit:chr3:.chain) > chain.log &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
        -continue=chainMerge \
	`pwd`/DEF > blastz.chainMerge.out 2>&1 &

    # failed during load due to mysql restart
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -continue=load \
	`pwd`/DEF > blastz.load.out 2>&1 &

    # SWAP ALIGNMENTS
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -swap \
        -workhorse=kolossus \
        `pwd`/DEF > swap.log 2>&1 &

    # failed on file perm change on liftOver file owned by Hiram
    rm /cluster/data/monDom4/bed/liftOver/monDom4ToPanTro2.over.chain.gz
    rm /cluster/data/panTro2/bed/blastz.monDom4.swap/axtChain/noClass.net
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -swap \
        -workhorse=kolossus \
        -continue=net \
        `pwd`/DEF > swap.net.log 2>&1 &



    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap `pwd`/DEF > swap.out 2>&1 &
    # XXXX	running 2006-04-04
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=net -swap `pwd`/DEF > swap.net.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=load -swap `pwd`/DEF > swap.load.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=cleanup -swap `pwd`/DEF > swap.cleanup.out 2>&1 &

    time nice -n +19 featureBits panTro2 chainMonDom4Link \
	> fb.panTro2.chainMonDom4Link 2>&1 &


# BLASTZ/CHAIN/NET XENTRO2 (DONE 4/22/06 angie)
    ssh kkstore04
    mkdir /cluster/data/monDom4/bed/blastz.xenTro2.2006-04-20
    cd /cluster/data/monDom4/bed/blastz.xenTro2.2006-04-20
    cat << '_EOF_' > DEF
# opossum vs. frog
BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64

# Use same params as used for mammal-xenTro1 (see makeXenTro1.doc)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Opossum monDom4
SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Frog xenTro2 - single chunk big enough to run two of the
#               largest scaffolds in one job
SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit
SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/cluster/data/monDom4/bed/blastz.xenTro2.2006-04-20
'_EOF_'
    # << emacs
    doBlastzChainNet.pl -blastzOutRoot=/san/sanvol1/monDom4XenTro2 \
      -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose DEF \
      >& do.log & tail -f do.log
    ln -s blastz.xenTro2.2006-04-20 /cluster/data/monDom4/bed/blastz.xenTro2

    ssh kolossus
    time nice -n +19 featureBits monDom4 chainXenTro2Link \
	> fb.monDom4.chainXenTro2Link 2>&1 &
    cat fb.monDom4.chainXenTro2Link
    #	8160051 bases of 3501643220 (2.232%) in intersection

###########################################################################
# CREATE OPOSSUM AND OTHER SPECIES LINEAGE-SPECIFIC REPEATS DIRECTORY
# AND NIB DIRECTORY ON THE SAN FOR BLASTZ CLUSTER RUNS 
# (DONE, 2006-04-26, hartera)
    # If there are no lineage-specific repeats for opossum and other species
    # then use all repeats.
    ssh pk
    mkdir -p /san/sanvol1/scratch/monDom4/linSpecRep.notInOthers
    foreach f (/cluster/data/monDom4/*/chr*.fa.out)
     cp -p $f \
        /san/sanvol1/scratch/monDom4/linSpecRep.notInOthers/$f:t:r:r.out.spec
    end
    # need nib files when running Blastz with lineage-specific repeats
    # so create these and move to the san.
    ssh kkstore04
    mkdir -p /cluster/data/monDom4/nib
    cd /cluster/data/monDom4
    foreach f (*/chr*.fa)
      echo "Processing $f ..."
      nice faToNib -softMask $f nib/$f:t:r.nib
    end
    # move nibs to the san as taking up a lot of space on /cluster/data/ 
    ssh pk
    mv /cluster/data/monDom4/nib /san/sanvol1/scratch/monDom4/

##########################################################################
# QUALITY (DONE - 2006-04-27 - 2006-04-28 - Hiram)
    ssh kkstore04
    #	this is actually a symlink to store8 to distribute the data on
    #	the full filesystems on kkstore04
    mkdir /cluster/data/monDom4/bed/quality
    cd /cluster/data/monDom4/bed/quality
    cp -p /cluster/data/monDom2/bed/quality/qaToWigEncode.pl .
    #	using perl script from previous build
    #	Update the file name in this file:
my $AGP_FILE="/cluster/data/monDom4/broad.mit.edu/Monodelphis4.0.agp";

    #	convert the qual.fa to wiggle single column input and then wigEncode:
	time nice -n +19 \
zcat /cluster/data/monDom4/broad.mit.edu/Monodelphis4.0.agp.chromosome.qual.gz \
	| ./qaToWigEncode.pl /dev/stdin \
	| $HOME/bin/$MACHTYPE/wigEncode -noOverlap \
		stdin quality.wig quality.wib

    #	Converted stdin, upper limit 63.00, lower limit 0.00
    #	real    96m57.628s

    #	Resulting files:
    ls -og quality.wi?
    #	-rw-rw-r--  1 3501643220 Apr 27 18:16 quality.wib
    #	-rw-rw-r--  1  322734978 Apr 27 18:16 quality.wig
    #	Note how the byte size of the .wib file is exactly equal
    #	to the number of bases in a featureBits run without -countGaps
    #	from the rmsk featureBits above:
    #	1814099592 bases of 3501643220 (51.807%) in intersection
    #	This means we have a valid quality score for every base that is
    #	not in a gap, which is what we want.  If this doesn't work out,
    #	you can turn this wiggle track into a bed file for each
    #	contiguous run of data, and then intersect those bed files with
    #	the gap track to see where they overlap.  This happened this
    #	time when a new type of gap "clone" was found to be defined in
    #	the agp file.  Previously there was only "fragment".  The perl
    #	script had to be altered to recognize this new type of gap.
    #	For example, this could be done after the table was loaded:
    ssh kolossus
    cd /tmp
    awk '{print $1}' /cluster/data/monDom4/chrom.sizes | while read C
    do
	hgWiggle -doBed -db=monDom4 -chr=${C} quality > ${C}.qual.bed
    done
    #	then to intersect:
    for C in 1 2 3 4 5 6 7 8 X Un
    do
	featureBits -chrom=chr${C} monDom4 gap chr${C}.qual.bed
    done
    #	should all be zero

    ssh hgwdev
    cd /cluster/data/monDom4/bed/quality
    ln -s `pwd`/quality.wib /gbdb/monDom4/wib
    time nice -n +19 hgLoadWiggle monDom4 quality quality.wig
    #	real    3m29.459s
    #	verify index:
    hgsql -e "show index from quality;" monDom4
    #	check that Cardinality is a number and is not NULL


####################################################################
### swap Hg18 blastz (DONE - 2006-04-28 - Hiram)
###
    ssh pk
    mkdir /cluster/data/monDom4/bed/blastz.hg18.swap
    cd /cluster/data/monDom4/bed
    ln -s blastz.hg18.swap blastz.hg18
    cd blastz.hg18.swap
    #	The original DEF file had iscratch directories for the monDom4
    #	sequence, for this one we need san directories.  This is a copy
    #	of the original one with the SEQ2_DIR and _LEN changed:
    cat << '_EOF_' > DEF
# human vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin

BLASTZ=blastz.v7

# settings for more distant organism alignments
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Human (hg18)
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Opossum monDom4
SEQ2_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
SEQ2_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/hg18/bed/blastzMonDom4.2006-02-13
TMPDIR=/scratch/tmp
'_EOF_'
    #	<< happy emacs

    cd /cluster/data/monDom4/bed/blastz.hg18.swap
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap DEF > swap.out 2>&1 &
    #	real    118m44.093s

    ssh kolossus
    cd /cluster/data/monDom4/bed/blastz.hg18.swap
    time nice -n +19 featureBits monDom4 chainHg18Link \
	> fb.monDom4.chainHg18Link 2>&1
    #	351100848 bases of 3501643220 (10.027%) in intersection

#######################################################################
### 7-Way conservation (DONE - 2006-04-28 - 2006-05-02 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/monDom4/bed/multiz7way
    cd /cluster/data/monDom4/bed/multiz7way
    cp /cluster/data/mm8/bed/multiz17way/17way.nh .

    /cluster/bin/phast/tree_doctor --prune \
	chimp_panTro2,macaque_rheMac2,rabbit_oryCun1,cow_bosTau2,dog_canFam2,armadillo_dasNov1,elephant_loxAfr1,tenrec_echTel1,tetraodon_tetNig1,fugu_fr1 \
	17way.nh > 7way.nh

    #	this leaves the tree we are working with here:
    (((((human_hg18:0.054922,
	(rat_rn4:0.081728,mouse_mm8:0.077017):0.335773):0.285925,
	monodelphis_monDom4:0.371073):0.189124,
	chicken_galGal2:0.454691):0.123297,
	xenopus_xenTro2:0.782453):0.156067,
	zebrafish_danRer3:0.938628);

    /cluster/bin/phast/draw_tree 7way.nh > 7way.ps
    /cluster/bin/phast/all_dists 7way.nh > 7way.distances.txt
    grep -y monDom4 7way.distances.txt | sort -k3,3n

    #	Print out that file for reference, and use the calculated
    #	distances in the table below to order the organisms and check
    #	the button order on the browser.  Zebrafish ends up before
    #	tetraodon and fugu on the browser despite its distance.
    #	And if you can fill in the table below entirely, you have
    #	succeeded in finishing all the alignments required.
    #
#                         featureBits chainLink measures
#                                      chainMonDom4Link      chain   linearGap
#     distance                      on MonDom4    on other   minScore
#  1  0.711920 - human hg18         (% 10.027)  (% 12.385)   5000     loose
#  2  1.014888 - chicken galGal2    (% 03.050)  (% 09.092)   5000     loose
#  3  1.069788 - mouse mm8          (% 06.024)  (% 08.245)   5000     loose
#  4  1.074499 - rat rn4            (% 05.657)  (% 07.784)   5000     loose
#  5  1.465947 - frog xenTro2       (% 02.232)  (no swap )   5000     loose
#  6  1.778189 - zebrafish danRer3  (% 02.090)  (% 04.001)   5000     loose

    cd /cluster/data/monDom4/bed/multiz7way
    #	bash shell syntax here ...
    export H=/cluster/data/monDom4/bed
    mkdir mafLinks
    for G in hg18 galGal2 mm8 rn4 xenTro2 danRer3
    do
	mkdir mafLinks/$G
	if [ ! -d ${H}/blastz.${G}/mafNet ]; then
	echo "missing directory blastz.${G}/mafNet"
	fi
	ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G
    done

    #	Usually these MAFs are copied to the san for a pk kluster run.
    #	in this case they are small enough that there is no need to copy
    #	them anywhere.  The alignment can be done right here on the
    #	kkstore04 filesystem.
    #	Copy MAFs to some appropriate NFS server for kluster run
    # ssh kkstore04
    # mkdir /san/sanvol1/scratch/monDom4/multiz7way
    # cd /san/sanvol1/scratch/monDom4/multiz7way
    # time rsync -a --copy-links --progress \
    #	    /cluster/data/monDom4/bed/multiz7way/mafLinks/ .

    #	We have about 734 Mb of data here, takes ~ 30 seconds to copy

    # the autoMultiz cluster run
    ssh kk

    mkdir /cluster/data/monDom4/bed/multiz7way/autoMultiz
    cd /cluster/data/monDom4/bed/multiz7way/autoMultiz
    
    mkdir penn
    #	cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
    #	cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
    #	cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
    cp -p /cluster/bin/penn/multiz.v11.i386/multiz-tba/multiz penn
    cp -p /cluster/bin/penn/multiz.v11.i386/multiz-tba/maf_project penn
    cp -p /cluster/bin/penn/multiz.v11.i386/multiz-tba/autoMZ penn


    # create species list and stripped down tree for autoMZ
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
	../7way.nh > tmp.nh
    echo `cat tmp.nh` > tree-commas.nh
    echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
    sed 's/[()]//g; s/,/ /g' tree.nh > species.lst

    mkdir -p maf run
    cd run

    #	NOTE: you need to set the db properly in this script
    #	And the binDir and pairs dir

    cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
set db = monDom4
set c = $1
set maf = $2
set binDir = /cluster/data/monDom4/bed/multiz7way/autoMultiz/penn
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /cluster/data/monDom4/bed/multiz7way/mafLinks
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
    set in = $pairs/$s/$c.maf
    set out = $db.$s.sing.maf
    if ($s == $db) then
	continue
    endif
    if (-e $in.gz) then
	zcat $in.gz > $out
    else if (-e $in) then
	cp $in $out
    else
	echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($binDir $path); rehash
$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
# << happy emacs
    chmod +x autoMultiz

cat  << '_EOF_' > template
#LOOP
autoMultiz $(root1) {check out line+ ../maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs

    awk '{print $1}' /cluster/data/monDom4/chrom.sizes > chrom.lst
    gensub2 chrom.lst single template jobList
    para create jobList
    # 10 jobs
    para try ... check ... push ... etc ...
# Completed: 10 of 10 jobs
# CPU time in finished jobs:      38783s     646.38m    10.77h    0.45d  0.001 y
# IO & Wait Time:                  1184s      19.73m     0.33h    0.01d  0.000 y
# Average job time:                3997s      66.61m     1.11h    0.05d
# Longest finished job:            8706s     145.10m     2.42h    0.10d
# Submission to last job:          8706s     145.10m     2.42h    0.10d

    #	combine results into a single file for loading and gbdb reference
    ssh kkstore04
    cd /cluster/data/monDom4/bed/multiz7way
    #	There used to be a mafFilter here with a minScore of 500, but it
    #	turns out that the scores in these maf files are pretty much
    #	useless.  They range from very large negatives to very large
    #	positives.
    time catDir maf > multiz7way.maf
    #	real    1m18.849s
    #	makes a 2 Gb file:
    #	-rw-rw-r--  1 2119554905 May  1 14:23 multiz7way.maf

    #	Create per-chrom individual maf files for downloads
    ssh kkstore04
    cd /cluster/data/monDom4/bed/multiz7way
    mkdir mafDownloads
    time for M in maf/chr*.maf
    do
	B=`basename $M`
	cp -p ${M} mafDownloads/${B}
	gzip mafDownloads/${B}
	echo ${B} done
    done
    #	real    7m21.713s
    #	deliver to downloads
    ssh hgwdev
    ln -s /cluster/data/monDom4/bed/multiz7way/mafDownloads \
	/usr/local/apache/htdocs/goldenPath/monDom4/multiz7way

    # Load into database
    ssh hgwdev
    cd /cluster/data/monDom4/bed/multiz7way
    mkdir /gbdb/monDom4/multiz7way
    ln -s /cluster/data/monDom4/bed/multiz7way/multiz7way.maf \
	/gbdb/monDom4/multiz7way
    time nice -n +19 hgLoadMaf monDom4 multiz7way
    #	Loaded 2180653 mafs in 1 files from /gbdb/monDom4/multiz7way
    #	real    3m35.280s

    time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
	-maxSize=50000 monDom4 multiz7waySummary multiz7way.maf
    #	Created 1423996 summary blocks from 5463968 components and
    #	2180653 mafs from multiz7way.maf
    #	real    5m40.531s

    # create tree image:
    cat << '_EOF_' > species.nh
(((((human,(rat,mouse)),opossum),chicken),frog),zebrafish)
'_EOF_'
    /cluster/bin/phast/draw_tree -b -s species.nh > species7.ps
    # photoshop to enhance, reduce the amount of whitespace to make it
    # smaller, then save as jpg
    ln -s /cluster/data/monDom4/bed/multiz7way/species7.jpg \
	/usr/local/apache/htdocs/images/phylo/monDom4_7way.jpg
    #	the chopping of whitespace can be done with Imagemagick
    #	'display' - use the "transform -> chop" menu, select horizontal
    #	or vertical chop, highlight an area to cut

############################################################################
# CREATE CONSERVATION WIGGLE WITH PHASTCONS
#		(DONE - 2006-05-01 - 2006-05-02 - Hiram)

    # Estimate phastCons parameters
    ssh kkstore04
    mkdir /cluster/data/monDom4/bed/multiz7way/cons
    cd /cluster/data/monDom4/bed/multiz7way/cons

    # Create a starting-tree.mod based on chr1 (the largest one)
    time /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr1.maf \
	--refseq ../../../1/chr1.fa --in-format MAF \
	--windows 100000000,1000 --out-format SS \
	--between-blocks 5000 --out-root s1
    #	real    5m50.239s

    /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \
--tree "(((((hg18,(rn4,mm8)),monDom4),galGal2),xenTro2),danRer3)" \
    --out-root starting-tree
    #	real    1m11.568s

    rm s1.*.ss
    # add up the C and G:
    grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
    #	0.398
    #	This 0.398 is used in the --gc argument below

    # Create big bad bloated SS files on san filesystem (takes ~ 2h 20m)
    #	Increasing their size this time from 1,000,000 to 10,000,000 to
    #	slow down the phastCons pk jobs
    ssh kkstore04
    mkdir -p  /san/sanvol1/scratch/monDom4/cons/ss
    cd  /san/sanvol1/scratch/monDom4/cons/ss
    time for C in `awk '{print $1}' /cluster/data/monDom4/chrom.sizes`
    do
      if [ -s /cluster/data/monDom4/bed/multiz7way/maf/${C}.maf ]; then
	mkdir ${C}
	echo msa_split $C
	chrN=${C/chr/}
	chrN=${chrN/_random/}
	/cluster/bin/phast/$MACHTYPE/msa_split \
	    /cluster/data/monDom4/bed/multiz7way/maf/${C}.maf \
	    --refseq /cluster/data/monDom4/${chrN}/${C}.fa \
	    --in-format MAF --windows 10000000,0 --between-blocks 5000 \
	    --out-format SS --out-root ${C}/${C}
      fi
    done &
    #	real    28m44.827s

    # Create a random list of 50 1 mb regions  (do not use the _randoms)
    cd /san/sanvol1/scratch/monDom4/cons/ss
    ls -1l chr*/chr*.ss | grep -v random | \
	awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list

    # Set up parasol directory to calculate trees on these 50 regions
    ssh kk
    mkdir /san/sanvol1/scratch/monDom4/cons/treeRun1
    cd /san/sanvol1/scratch/monDom4/cons/treeRun1
    mkdir tree log

    #	Tuning this loop should come back to here to recalculate 
    # Create little script that calls phastCons with right arguments
    #	--target-coverage of 0.20 is about right for mouse, will be
    #	tuned exactly below
    cat > makeTree.csh << '_EOF_'
#!/bin/csh -fe
set C=$1:h
mkdir -p log/${C} tree/${C}
    /cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \
      /cluster/data/monDom4/bed/multiz7way/cons/starting-tree.mod \
      --gc 0.398 --nrates 1,1 --no-post-probs --ignore-missing \
      --expected-lengths 12 --target-coverage 0.17 \
      --quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
#	<< happy emacs
      chmod a+x makeTree.csh

# Create gensub file
      cat > template << '_EOF_'
#LOOP
makeTree.csh $(path1)
#ENDLOOP
'_EOF_'
#	<< happy emacs

# Make cluster job and run it
    gensub2 ../randomSs.list single template jobList
    para create jobList
    para try/push/check/etc
# Completed: 50 of 50 jobs
# CPU time in finished jobs:      38161s     636.02m    10.60h    0.44d  0.001 y
# IO & Wait Time:                   385s       6.41m     0.11h    0.00d  0.000 y
# Average job time:                 771s      12.85m     0.21h    0.01d
# Longest finished job:            1672s      27.87m     0.46h    0.02d
# Submission to last job:          1693s      28.22m     0.47h    0.02d


# Now combine parameter estimates.  We can average the .mod files
# using phyloBoot.  This must be done separately for the conserved
# and nonconserved models
    ssh kkstore04
    cd /san/sanvol1/scratch/monDom4/cons/treeRun1
    ls -1 tree/chr*/*.cons.mod > cons.list
    /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \
	--output-average ../ave.cons.mod > cons_summary.txt
    ls -1 tree/chr*/*.noncons.mod > noncons.list
    /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \
	--output-average ../ave.noncons.mod > noncons_summary.txt
    cd ..
    cp -p ave.*.mod /cluster/data/monDom4/bed/multiz7way/cons

#	measuring entropy
#	consEntopy <target coverage> <expected lengths>
#		 ave.cons.mod ave.noncons.mod --NH 9.78
#	never stops with the --NH argument
/cluster/bin/phast/$MACHTYPE/consEntropy .17 12 --NH 9.78 \
	ave.cons.mod ave.noncons.mod
# ( Solving for new omega: 12.000000 9.952747 9.669713 9.663148 )
# Trans parameters: gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068
# Relative entropy: H=1.090778 bits/site
# Expected min. length: L_min=9.400348 sites
# Expected max. length: L_max=6.597224 sites
# Phylogenetic information threshold: PIT=L_min*H=10.253692 bits
# Recommended expected length: omega=9.663148 sites (for L_min*H=9.780000)
XXXX - working 2006-05-01 17:00


### !!! ***  This one with .17 and 12 is the one that was finally used
# consEntropy .17 12 ave.cons.mod.4 ave.noncons.mod.4
# Transition params: gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068
# Relative entropy: H=1.478838 bits/site
# Required length: N=6.753382 sites
# Total entropy: NH=9.987159 bits

#	SKIP to here passing by the tuning numbers
ssh kk
# Create cluster dir to do main phastCons run
mkdir /san/sanvol1/scratch/monDom4/cons/consRun1
cd /san/sanvol1/scratch/monDom4/cons
cp /san/sanvol1/scratch/mm8/cons/elliotsEncode.mod .
#	This file looks like:
ALPHABET: A C G T
ORDER: 0
SUBST_MOD: REV
TRAINING_LNL: -988246.132962
BACKGROUND: 0.295 0.205 0.205 0.295
RATE_MAT:
-1.165221    0.315494    0.589884    0.259843
   0.189778   -0.878194    0.208718    0.479698
   0.444622    0.261535   -0.885604    0.179447
   0.234867    0.720815    0.215191   -1.170872
TREE: (((((((((((((hg18:0.006690,panTro2:0.007571):0.024272,(colobus_monkey:0.015404,(baboon:0.008258,rheMac2:0.028617):0.008519):0.022120):0.023960,(dusky_titi:0.025662,(owl_monkey:0.012151,marmoset:0.029549):0.008236):0.027158):0.066101,(mouse_lemur:0.059024,galago:0.121375):0.032386):0.017073,((rn4:0.081728,monDom4:0.077017):0.229273,oryCun1:0.206767):0.023340):0.023026,(((bosTau2:0.159182,canFam2:0.147731):0.004946,rfbat:0.138877):0.010150,(hedgehog:0.193396,shrew:0.261724):0.054246):0.024354):0.028505,dasNov1:0.149862):0.015994,(loxAfr1:0.104891,echTel1:0.259797):0.040371):0.218400,monDom4:0.371073):0.065268,platypus:0.468116):0.123856,galGal2:0.454691):0.123297,xenTro2:0.782453):0.156067,((tetNig1:0.199381,fr1:0.239894):0.492961,danRer3:0.782561):0.156067);


    cd /san/sanvol1/scratch/monDom4/cons/consRun1
    mkdir ppRaw bed

    # Create script to run phastCons with right parameters
    #	These parameters:
    #	--rho 0.28 --expected-length 14 --target-coverage 0.008 --quiet \
    #	were taken from Kate's 17-way in Hg17, including the
    #	-not-informative panTro2
    #	This job is I/O intensive in its output files, thus it is all
    #	working over in /scratch/tmp/
    cat > doPhast << '_EOF_'
#!/bin/csh -fe
mkdir /scratch/tmp/${2}
cp -p ../ss/${1}/${2}.ss ../elliotsEncode.mod /scratch/tmp/${2}
pushd /scratch/tmp/${2} > /dev/null
/cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss elliotsEncode.mod \
   --expected-lengths 12 --target-coverage 0.17 --quiet \
	--seqname ${1} --idpref ${1} --viterbi ${2}.bed --score \
	    --require-informative 0 > ${2}.pp
popd > /dev/null
mkdir -p ppRaw/${1}
mkdir -p bed/${1}
mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
mv /scratch/tmp/${2}/${2}.bed bed/${1}
rm /scratch/tmp/${2}/elliotsEncode.mod
rm /scratch/tmp/${2}/${2}.ss
rmdir /scratch/tmp/${2}
'_EOF_'
    # << happy emacs
    chmod a+x doPhast

    #	root1 == chrom name, file1 == ss file name without .ss suffix
    # Create gsub file
    cat > template << '_EOF_'
#LOOP
doPhast $(root1) $(file1)
#ENDLOOP
'_EOF_'
    #	<< happy emacs

    # Create parasol batch and run it
    ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list

    gensub2 in.list single template jobList
    para create jobList
    para try/check/push/etc.
    #	These jobs are very fast and very I/O intensive, even on the san
    #	they will hang it up as they work at full tilt.
# Completed: 366 of 366 jobs
# CPU time in finished jobs:      29300s     488.33m     8.14h    0.34d  0.001 y
# IO & Wait Time:                 19643s     327.39m     5.46h    0.23d  0.001 y
# Average job time:                 134s       2.23m     0.04h    0.00d
# Longest finished job:             247s       4.12m     0.07h    0.00d
# Submission to last job:          1880s      31.33m     0.52h    0.02d

    # combine predictions and transform scores to be in 0-1000 interval
    #	it uses a lot of memory, so on kolossus:
    ssh kolossus
    cd /san/sanvol1/scratch/monDom4/cons/consRun1
    #	The sed's and the sort get the file names in chrom,start order
    find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
	| sort -k7,7 -k9,9n \
	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
	| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
	| /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
    #	~ 1 minute
    cp -p mostConserved.bed /cluster/data/monDom4/bed/multiz7way

    # Figure out how much is actually covered by the bed files as so:
    #	The 2583393846 comes from the non-n genome size,
    #	from faSize on all chroms:
    ssh kkstore04
    cd /cluster/data/monDom4
    faSize ?{,?}/chr*.fa
    #	3605614649 bases (103971429 N's 3501643220 real 1551686450 upper
    #	1949956770 lower) in 10 sequences in 10 files

    cd /san/sanvol1/scratch/monDom4/cons/consRun1
    awk '
{sum+=$3-$2}
END{printf "%% %.2f = 100.0*%d/3501643220\n",100.0*sum/3501643220,sum}' \
	mostConserved.bed
    #	--expected-lengths 12 --target-coverage 0.17 --quiet \
    #	% 4.80 = 100.0*168149380/3501643220

    ssh kolossus
    cd /san/sanvol1/scratch/monDom4/cons/consRun1
    #	Aiming for %70 coverage in
    #	the following featureBits measurement on CDS:
    # Beware of negative scores when too high.  The logToBedScore
    # will output an error on any negative scores.

    #	I don't think this will work for opossum because we don't have
    #	enough refGene:cds coverage:
    time nice -n +19 featureBits monDom4 refGene:cds
    #	34857 bases of 3501643220 (0.001%) in intersection

    HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits monDom4 \
	-enrichment refGene:cds mostConserved.bed
    #	--expected-lengths 12 --target-coverage 0.17 --quiet \
    #	refGene:cds 0.001%, mostConserved.bed 4.802%, both 0.001%, cover
    #	75.13%, enrich 15.65x

    # Load most conserved track into database
    ssh hgwdev
    cd /cluster/data/monDom4/bed/multiz7way
    time nice -n +19 hgLoadBed -strict monDom4 \
	phastConsElements7way mostConserved.bed
    #	real    2m31.673s
    #	Loaded 1882640 elements of size 5

    #	should measure the same as above
    time nice -n +19 featureBits monDom4 -enrichment refGene:cds \
	phastConsElements7way
    #	refGene:cds 0.001%, phastConsElements7way 4.802%, both 0.001%,
    #	cover 75.13%, enrich 15.65x

    # Create merged posterier probability file and wiggle track data files
    ssh kkstore04
    cd /san/sanvol1/scratch/monDom4/cons/consRun1
    # the sed business gets the names sorted by chromName, chromStart
    #	so that everything goes in numerical order into wigEncode
    #	You will want to test this to make sure it is sorting correctly,
    #	run this command and see if everything is in order by chrom and
    #	position:
    find ./ppRaw -type f \
	| sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
	| sort -k7,7 -k9,9n \
	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | less

    time nice -n +19 find ./ppRaw -type f \
	| sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
	| sort -k7,7 -k9,9n \
	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
	    | $HOME/bin/$MACHTYPE/wigEncode -noOverlap stdin \
		phastCons7.wig phastCons7.wib
    #	Converted stdin, upper limit 1.00, lower limit 0.00
    #	real    4m4.694s

    #	-rw-rw-r--    1 465425363 May  2 12:42 phastCons7.wib
    #	-rw-rw-r--    1  79633105 May  2 12:42 phastCons7.wig

    cp -p phastCons7.wi? /cluster/data/monDom4/bed/multiz7way/

    #	prepare compressed copy of ascii data values for downloads
    ssh kkstore04
    cd /san/sanvol1/scratch/monDom4/cons/consRun1
    cat << '_EOF_' > gzipAscii.sh
#!/bin/sh

TOP=`pwd`
export TOP

mkdir -p phastCons7Scores

for D in ppRaw/chr*
do
    C=${D/ppRaw\/}
    out=phastCons7Scores/${C}.data.gz
    echo "========================== ${C} ${D}"
    find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
	| sort -k7,7 -k9,9n \
	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat |
	    gzip > ${out}
done
'_EOF_'
    #	<< happy emacs
    chmod +x gzipAscii.sh
    time nice -n +19 ./gzipAscii.sh
    #	real    8m10.959s

    #	copy them for downloads
    ssh kkstore04
    mkdir /cluster/data/monDom4/bed/multiz7way/phastCons7Scores
    cd /cluster/data/monDom4/bed/multiz7way/phastCons7Scores
    cp -p  /san/sanvol1/scratch/monDom4/cons/consRun1/phastCons7Scores/* .

    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/monDom4
    ln -s /cluster/data/monDom4/bed/multiz7way/phastCons7Scores .

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /cluster/data/monDom4/bed/multiz7way
    ln -s `pwd`/phastCons7.wib /gbdb/monDom4/wib/phastCons7.wib
    time nice -n +19 hgLoadWiggle monDom4 phastCons7 phastCons7.wig
    #	real    0m52.109s

    #  Create histogram to get an overview of all the data
    ssh hgwdev
    cd /cluster/data/monDom4/bed/multiz7way
    time nice -n +19 hgWiggle -doHistogram \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    -db=monDom4 phastCons7 > histogram.data 2>&1
    #	real    9m8.075s
    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
        x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Opossum MonDom4 Histogram phastCons7 track"
set xlabel " phastCons7 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

#########################################################################
# Build maf annotation for multiz7way  (DONE - 2006-07-31 - Hiram)

    ssh kolossus
    cd /cluster/data/monDom4/bed/multiz7way
    mkdir anno 
    cd anno
    mkdir maf run
    cd run
    rm sizes nBeds

    for i in `cat /cluster/data/monDom4/bed/multiz7way/species.lst`
do
    ln -s  /cluster/data/$i/chrom.sizes $i.len
    ln -s  /cluster/data/$i/$i.N.bed $i.bed
    echo $i.bed  >> nBeds
    echo $i.len  >> sizes
done

    echo "#!/bin/csh -fe" > jobs.csh
    echo date >> jobs.csh
    #	do smaller jobs first, the 'ls -rS' lists in reverse order by size
    for i in `ls -rS ../../maf/*.maf`
do
echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $i /cluster/data/monDom4/monDom4.2bit ../maf/`basename $i` >> jobs.csh
echo "echo $i" >> jobs.csh
done 
    echo date >> jobs.csh

    chmod +x jobs.csh
    
    #	establish a screen for this long running job
    screen
    cd /cluster/data/monDom4/bed/multiz7way
    time ./jobs.csh > jobs.log 2>&1
    #	real    195m54.071s
    #	user    57m51.373s
    #	sys     128m25.386s
    #	ctrl-a ctrl-d to release
    #	to return to the screen:
    screen -r -d


    cd /cluster/data/monDom4/bed/multiz7way/anno/maf
    mkdir -p /gbdb/monDom4/multiz7way/anno/maf
    ln -s /cluster/data/monDom4/bed/multiz7way/anno/maf/*.maf \
        /gbdb/monDom4/multiz7way/anno/maf

    time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/monDom4/multiz7way/anno/maf \
                monDom4 multiz7way
# Loaded 2781002 mafs in 10 files from /gbdb/monDom4/multiz7way/anno/maf
# real    1m49.236s

    time cat *.maf \
	| nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
		-maxSize=50000 monDom4 multiz7waySummary stdin
# Created 1423996 summary blocks from 5463968 components and 2781002 mafs
#	from stdin
#	real    2m30.875s

#######################################################################
# MULTIZ7WAY MAF FRAMES (DONE - 2006-08-01 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/monDom4/bed/multiz7way/frames
    cd /cluster/data/monDom4/bed/multiz7way/frames
    # The following is adapted from the sequence in mm8.txt

    #------------------------------------------------------------------------
    # get the genes for all genomes
    # mRNAs with CDS.  single select to get cds+psl, then split that up and
    # create genePred
    # using mrna table as genes:
    mkdir genes

    for qDB in danRer3
    do
      tmpExt=`mktemp temp.XXXXXX`
      tmpMrnaCds=${qDB}.mrna-cds.${tmpExt}
      tmpMrna=${qDB}.mrna.${tmpExt}
      tmpCds=${qDB}.cds.${tmpExt}
      echo $qDB
      hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
                   from all_mrna,gbCdnaInfo,cds \
                   where (all_mrna.qName = gbCdnaInfo.acc) and \
                     (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
       ${qDB} > ${tmpMrnaCds}
      cut -f 1-2  ${tmpMrnaCds} > ${tmpCds}
      cut -f 4-100  ${tmpMrnaCds} > ${tmpMrna}
      mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \
        stdout \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/$qDB.tmp.gz
      rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
      mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
      rm -f $tmpExt
    done

    # using knownGene for rn4 mm8 hg18
    # using refGene for galGal2
    # using mgcGenes for xenTro2
    # using ensGene for monDom4
    # genePreds; (must keep only the first 10 columns for knownGene)
    for qDB in rn4 mm8 hg18 galGal2 xenTro2 monDom4
    do
      if [ $qDB = "xenTro2" ]; then
        geneTbl=mgcGenes
      elif [ $qDB = "galGal2" ]; then
        geneTbl=refGene
      elif [ $qDB = "monDom4" ]; then
        geneTbl=ensGene
      else
        geneTbl=knownGene
      fi
      echo hgsql -N -e 'select * from '"$geneTbl ${qDB}"
      hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 1-10 \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/$qDB.tmp.gz
      mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
      rm -f $tmpExt
    done

    #------------------------------------------------------------------------
    # create frames
    ssh kkstore04
    cd /cluster/data/monDom4/bed/multiz7way/frames
    rm mkMafFrames.sh
    for C in `awk '{print $1;}' /cluster/data/monDom4/chrom.sizes`
do
    echo "echo mafFrames/${C}.maf"
    echo genePredToMafFrames monDom4 ../mafDownloads/${C}.maf.gz \
	mafFrames/${C}.maf \
	monDom4 genes/monDom4.gp.gz \
	hg18 genes/hg18.gp.gz \
	mm8 genes/mm8.gp.gz \
	rn4 genes/rn4.gp.gz \
	galGal2 genes/galGal2.gp.gz \
	danRer3 genes/danRer3.gp.gz \
	xenTro2 genes/xenTro2.gp.gz
done > mkMafFrames.sh

    chmod +x mkMafFrames.sh
    time nice -n +19 ./mkMafFrames.sh
    #	real    1m10.711s
    gzip mafFrames/*.maf


    #------------------------------------------------------------------------
    # load the database
    ssh hgwdev
    cd /cluster/data/monDom4/bed/multiz7way/frames
    time nice -n +19 hgLoadMafFrames monDom4 multiz7wayFrames \
	mafFrames/*.maf.gz
    #	real    0m17.864s

    #	The use of this table requires an entry in the trackDb.ra for this:
    #	frames multiz7wayFrames
    #	irows on

###########################################################################
# BLASTZ SWAP OF ZEBRAFISH (danRer4) CHAIN AND NET ALIGNMENTS OVER TO 
# monDom4 AND CREATE AXNET, MAFNET, LIFTOVER AND ALIGNMENT DOWNLOADS 
# (DONE, 2006-04-29, hartera)
    # see also makeDanRer4.doc
    # Remove test tables and directories and start again:
    ssh hgwdev
    cd /cluster/data/monDom4
    foreach c (`cat chrom.lst`)
       hgsql -e "drop table chr${c}_chainDanRer4;" monDom4
       hgsql -e "drop table chr${c}_chainDanRer4Link;" monDom4
       hgsql -e "drop table chr${c}_chainDanRer4NoDyMsk;" monDom4
       hgsql -e "drop table chr${c}_chainDanRer4NoDyMskLink;" monDom4
    end
    hgsql -e "drop table netDanRer4;" monDom4
    hgsql -e "drop table netDanRer4NoDyMsk;" monDom4
    # remove downloads
    rm -r /usr/local/apache/htdocs/goldenPath/monDom4/vsDanRer4
    rm \
    /usr/local/apache/htdocs/goldenPath/monDom4/liftOver/monDom4ToDanRer4.over.chain.gz
    rm /cluster/data/monDom4/bed/liftOver/monDom4ToDanRer4.over.chain.gz
    # remove old Blastz swap
    rm -r /cluster/data/monDom4/bed/blastz.danRer4.swap
    # remove link to old blastz directory
    rm -r /cluster/data/monDom4/bed/blastz.danRer4

    # Used the following BLASTZ parameters:
    # BLASTZ_H=2000
    # BLASTZ_Y=3400
    # BLASTZ_L=6000
    # BLASTZ_K=2200
    # BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q
    # Used all repeats as lineage-specific repeats.
    # Swap directory is: /cluster/data/monDom4/bed/blastz.danRer4.swap
    ssh pk
    cd /cluster/data/danRer4/bed/blastz.monDom4.2006-04-28
    nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
        -swap `pwd`/DEF >& doSwap.log &
    
    featureBits monDom4 chainDanRer4Link 
    # 62249730 bases of 3501643220 (1.778%) in intersection
    featureBits monDom4 chainDanRer3Link 
    # 73187496 bases of 3501643220 (2.090%) in intersection
    featureBits monDom4 netDanRer4
    # 745122163 bases of 3501643220 (21.279%) in intersection
    featureBits monDom4 netDanRer3
    # 672446285 bases of 3501643220 (19.204%) in intersection
    featureBits -chrom=chr1 monDom4 refGene:cds chainDanRer4Link -enrichment
    # refGene:cds 0.001%, chainDanRer4Link 1.807%, both 0.000%, 
    # cover 43.85%, enrich 24.27x 
    featureBits -chrom=chr1 monDom4 refGene:cds chainDanRer3Link -enrichment
    # refGene:cds 0.001%, chainDanRer3Link 2.044%, both 0.000%, 
    # cover 49.24%, enrich 24.09x

    # Only 36 RefSeqs in monDom4 refGene so try xenoRefGene (78707 sequences)
    # and mrna (174 sequences):
    featureBits -chrom=chr1 monDom4 xenoRefGene:cds chainDanRer4Link -enrichment
    # xenoRefGene:cds 0.820%, chainDanRer4Link 1.807%, both 0.661%, 
    # cover 80.63%, enrich 44.63x
    featureBits -chrom=chr1 monDom4 xenoRefGene:cds chainDanRer3Link -enrichment
    # xenoRefGene:cds 0.820%, chainDanRer3Link 2.044%, both 0.577%, 
    # cover 70.45%, enrich 34.47x
    featureBits -chrom=chr1 monDom4 mrna chainDanRer4Link -enrichment
    # mrna 0.004%, chainDanRer4Link 1.807%, both 0.002%, 
    # cover 52.67%, enrich 29.15x
    featureBits -chrom=chr1 monDom4 mrna chainDanRer3Link -enrichment
    # mrna 0.004%, chainDanRer3Link 2.044%, both 0.002%, 
    # cover 54.50%, enrich 26.66x
 
    featureBits -chrom=chr1 monDom4 xenoRefGene:cds netDanRer4 -enrichment
    # xenoRefGene:cds 0.820%, netDanRer4 24.539%, both 0.720%, 
    # cover 87.79%, enrich 3.58x
    featureBits -chrom=chr1 monDom4 xenoRefGene:cds netDanRer3 -enrichment
    # xenoRefGene:cds 0.820%, netDanRer3 22.272%, both 0.640%, 
    # cover 78.12%, enrich 3.51x
    featureBits -chrom=chr1 monDom4 mrna netDanRer4 -enrichment
    # mrna 0.004%, netDanRer4 24.539%, both 0.002%, cover 56.90%, enrich 2.32x
    featureBits -chrom=chr1 monDom4 mrna netDanRer3 -enrichment
    # mrna 0.004%, netDanRer3 22.272%, both 0.002%, cover 62.62%, enrich 2.81x

##########################################################################
# PRODUCING GENSCAN PREDICTIONS (WORKING - 2006-05-02 - Hiram)
    #	Want to make up a chrom and contig 2bit file to get proper
    #	names on the genscan predictions.
    ssh kkstore04
    cd /cluster/data/monDom4/Un
    grep chrUn ../broad.mit.edu/Monodelphis4.0.agp > chrUn.broad.agp
    cat chrUn.broad.agp | /cluster/bin/scripts/agpToLift > chrUn_random.lft
    $HOME/kent/src/hg/utils/lft2BitToFa.pl ../monDom4.2bit chrUn_random.lft \
	> chrUn_random.ctg.fa

    #	Create a 2bit file with the full chrom sequences and the
    #	random contigs, all hard masked
    cat ?/chr?.fa Un/chrUn_random.ctg.fa \
	| maskOutFa stdin hard stdout \
	    | faToTwoBit stdin monDom4Chroms_RandomContigs.hard.2bit
    #	make sure it still has all the unmasked sequence in it:
    twoBitToFa monDom4Chroms_RandomContigs.hard.2bit stdout \
	| faSize stdin
    #	3589973501 bases (2038287051 N's 1551686450 real 1551686450
    #	upper 0 lower) in 9949 sequences in 1 files
    twoBitToFa monDom4.2bit stdout | faSize stdin
    #	3605614649 bases (103971429 N's 3501643220 real 1551686450 upper
    #	1949956770 lower) in 10 sequences in 1 files

    #	note the 'real' bases are the same, the lowers have become N's
    #	1089350685 + 97171400 = 1186522085 
    #	1949956770 + 103971429 = 2053928199
    #	2053928199 - 2038287051  = 15641148 == N's in gaps between contigs

    #	There are a lot of contigs that became completely N's with no
    #	sequence left.  genscan doesn't like that.  We need to get them
    #	out of the analysis.  Pick out the ones with over 18 real bases
    #	left:
    twoBitToFa monDom4Chroms_RandomContigs.hard.2bit stdout \
	| faCount stdin > chroms_randoms.faCount
    egrep -v "^#|^total" chroms_randoms.faCount \
	| awk '{if ($2-$7 > 17) {print $1}}' > over18.lst
    #	creating 4,000,000 sized chunks
    mkdir hardChunks
    twoBitToFa monDom4Chroms_RandomContigs.hard.2bit stdout \
	| faSomeRecords /dev/stdin over18.lst stdout \
	    | faSplit about stdin 4000000 hardChunks/c_
    rsync -a --progress hardChunks/ /cluster/bluearc/monDom4/hardChunks/


    ssh hgwdev
    mkdir /cluster/data/monDom4/bed/genscan
    cd /cluster/data/monDom4/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux

    # Run on pk cluster instead of kki since these jobs are going to
    # take quite a while and we don't want to tie up the kki kluster
    # entirely.
    ssh pk
    cd /cluster/data/monDom4/bed/genscan
    # Make 3 subdirectories for genscan to place output files
    mkdir gtf pep subopt
    # Generate a list file, genome.list, of all the hard-masked contigs that 
    # *do not* consist of all-N's (which would cause genscan to blow up)
    #	Since we split on gaps, we have no chunks like that.  You can
    #	verify with faCount on the chunks.
    ls -1S /cluster/bluearc/monDom4/hardChunks/c_*.fa > genome.list

    # Create template file, for gensub2.  For example (3-line file):
    cat << '_EOF_' > template
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan - par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << emacs
    gensub2 genome.list single template jobList
    para create jobList
    para try, check, push, check, ...
XXXX - running 2006-05-04 14:13
# Completed: 906 of 906 jobs
# CPU time in finished jobs:     215764s    3596.07m    59.93h    2.50d  0.007 y
# IO & Wait Time:                  2798s      46.63m     0.78h    0.03d  0.000 y
# Average job time:                 241s       4.02m     0.07h    0.00d
# Longest finished job:           57335s     955.58m    15.93h    0.66d
# Submission to last job:         63345s    1055.75m    17.60h    0.73d

    cat gtf/c_*.gtf | liftUp -type=.gtf t.gtf \
	/cluster/data/monDom4/Un/chrUn_random.lft carry stdin

    # Concatenate and lift to chrom results:
    ssh kkstore04
    cd /cluster/data/monDom4/bed/genscan
    for C in `awk '{print $1}' ../../chrom.sizes | sed -e "s/chr//"`
    do
	mkdir -p ${C}
	catDir gtf | liftUp ${C}/chr${C}.gtf \
	    /cluster/data/monDom4/hardChunks.lft error stdin
	catDir subopt | liftUp ${C}/chr${C}.bed \
	    /cluster/data/monDom4/hardChunks.lft error stdin
	catDir pep > ${C}/chr${C}.pep
	echo "done: chr${C}"
    done

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/monDom4/bed/genscan
    ldHgGene -bin -gtf monDom4 genscan 1/chr1.gtf
    #	35103 gene predictions
    hgPepPred monDom4 generic genscanPep 1/chr1.pep
    hgLoadBed monDom4 genscanSubopt 1/chr1.bed
    #	Loaded 446953 elements of size 6

    #	Compare with other assemblies:
    featureBits monDom4 genscan
    #	48628209 bases of 3496445653 (1.391%) in intersection
    featureBits monDom1 genscan
    #	47170795 bases of 3492108230 (1.351%) in intersection
    featureBits monDom4 genscanSubopt
    #	52714123 bases of 3496445653 (1.508%) in intersection
    featureBits monDom1 genscanSubopt
    #	51805835 bases of 3492108230 (1.484%) in intersection
    featureBits hg17 genscan
    #	55323340 bases of 2866216770 (1.930%) in intersection
    featureBits hg17 genscanSubopt
    #	55986178 bases of 2866216770 (1.953%) in intersection

    # Clean up
    cd /san/sanvol1/scratch/monDom4
    rm -fr hardMasked200K
    #	This kolossus clean up was actually left until much later.
    #	I found these copies here to be useful later on.
    ssh kolossus
    cd /scratch/monDom4
    rm -f *
    cd ..
    rmdir monDom4

#########################################################################
#  creating bigZips downloads (DONE - 2006-05-15 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/monDom4/downloads
    mkdir /cluster/data/monDom4/downloads/bigZips
    mkdir /cluster/data/monDom4/downloads/chromosomes
    cd /cluster/data/monDom4/downloads/chromosomes
    #	There wasn't a README for a previous assembly, these things are
    #	pretty generic, the mm8 one can be used with only slight edits.
    #	Next time use the previous Opossum assembly README.
    cp /usr/local/apache/htdocs/goldenPath/mm8/chromosomes/README.txt .
    #	edit that readme to provide correct references and details
    ln -s `pwd` /usr/local/apache/htdocs/goldenPath/monDom4/chromosomes

    ssh kkstore04
    cd /cluster/data/monDom4/downloads/chromosomes
    for C in 1 2 3 4 5 6 7 8 X Un
    do
	echo "gzip -c ../../${C}/chr${C}.fa > chr${C}.fa.gz"
	gzip -c ../../${C}/chr${C}.fa > chr${C}.fa.gz
    done
    md5sum *.fa.gz R*.txt > md5sum.txt

    cd ../bigZips
    gzip -c ../../broad.mit.edu/Monodelphis4.0.agp > Monodelphis4.0.agp.gz
    cd ../..
    tar cvzf downloads/bigZips/chromFa.tar.gz ?/chr*.fa Un/chrUn.fa
    #	16 minutes
    tar cvzf downloads/bigZips/chromFaMasked.tar.gz ?/chr*.fa.masked \
	Un/chrUn.fa.masked
    #	8 minutes
    tar cvzf downloads/bigZips/chromOut.tar.gz ?/chr*.fa.out Un/chrUn.fa.out
    cd bed/simpleRepeat
    mkdir trfMask
    cd trfMask
    for C in 1 2 3 4 5 6 7 8 X Un
    do
	ln -s ../chromMask/chr${C}_Mask.bed chr${C}.bed
    done
    tar --dereference cvzf ../../downloads/bigZips/chromTrf.tar.gz ./trfMask

    ssh hgwdev
    cd /cluster/data/monDom4/downloads/bigZips
    cp /usr/local/apache/htdocs/goldenPath/mm8/bigZips/README.txt .

    ln -s /cluster/data/monDom4/downloads/bigZips \
	/usr/local/apache/htdocs/goldenPath/monDom4/bigZips
    cd /usr/local/apache/htdocs/goldenPath/monDom4/bigZips
    ln -s /cluster/data/monDom4/monDom4.2bit .

    ssh kkstore04
    cd /cluster/data/monDom4/downloads/bigZips
    md5sum *.gz *.2bit R*.txt > md5sum.txt

    
#########################################################################
# BLASTZ CHICKEN galGal3 (DONE 5/26/06 angie)
    ssh pk
    mkdir /cluster/data/monDom4/bed/blastzGalGal3.2006-05-25
    cd /cluster/data/monDom4/bed/blastzGalGal3.2006-05-25
    cat << '_EOF_' > DEF
# opossum vs chicken
BLASTZ=blastz.v7.x86_64

# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Opossum monDom4
SEQ1_DIR=/san/sanvol1/scratch/monDom4/nib
SEQ1_SMSK=/san/sanvol1/scratch/monDom4/linSpecRep.notInOthers
SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
SEQ2_DIR=/san/sanvol1/galGal3/nib
SEQ2_LEN=/cluster/data/galGal3/chrom.sizes
SEQ2_SMSK=/san/sanvol1/galGal3/linSpecRep
SEQ2_CHUNK=200000000
SEQ2_LAP=0

BASE=/cluster/data/monDom4/bed/blastzGalGal3.2006-05-25
'_EOF_'
    # << emacs
    doBlastzChainNet.pl DEF -blastzOutRoot /san/sanvol1/scratch/gg3vsmonDom4 \
      -bigClusterHub=pk -smallClusterHub=pk \
      -chainMinScore=5000 -chainLinearGap=loose \
      >& do.log & tail -f do.log
    ln -s blastzGalGal3.2006-05-25 /cluster/data/monDom4/bed/blastz.galGal3


#####################################################################
#### LOAD ENSEMBL GENES (DONE - 2006-06-21 - Hiram)
   mkdir /cluster/data/monDom4/bed/ensGene
   cd /cluster/data/monDom4/bed/ensGene

        Get the Ensembl BioMart at http://www.ensembl.org/Multi/martview
        Choose Ensembl 39 and Monodelphis domestica, click next
	It displays status in a window on the right, indicating how many
	entries are here, currently: 20,907
    	The next page is the "filter" step, we do not want any filters,
	nothing is changed on this page, click next
	Now we are on the "output" tab, the filter in the window on the right
	indicates that 20,907 passed the filter.  (there is no filter)
	Now, on this output page, change the pull-down menu item from
	its default of "features" to read "structures"
	All the check-boxes now change.
	Mark the check box GTF under output format
	Under Gene Ensemble Attributes,
	Unselect Biotype
	Select
	Ensembl Gene ID
	Ensembl Transcript ID
	External Gene ID

	gzip compression
	and give it a filename: ensGeneMonDom4
	it will add the .gff.gz suffix
	press "export"


    #	They have a chrMT - we don't have the chrM sequence, 
    # Add "chr" to front of each line
    #	 in the gene data gtf file to make 
    # it compatible with ldHgGene and convert remove chrMT name

    zcat ensGeneMonDom4.gff.gz \
	| sed -e "s/^\([0-9XYU][0-9n]*\)/chr\1/; /^MT/d" > ensGene.gtf
    #	Verify names converted OK:
    awk '{print $1}' ensGene.gtf | sort | uniq -c
    #	153247 chr1
    #	126767 chr2
    #	 82637 chr3
    #	 90199 chr4
    #	 47661 chr5
    #	 50469 chr6
    #	 34230 chr7
    #	 55841 chr8
    #	 41868 chrUn
    #	  9965 chrX

    ldHgGene -gtf monDom4 ensGene ensGene.gtf
# Read 33878 transcripts in 692884 lines in 1 files
#   33878 groups 10 seqs 1 sources 4 feature types
# 33878 gene predictions

    featureBits monDom4 ensGene
    #	32770559 bases of 3501643220 (0.936%) in intersection

    #	Let's see if the peptides created from the genome sequence are
    #	different than the peptides that Ensemble says they are making:

    #	fetch chrX peptides from the ensGene table, sort them by their
    #	name into single line instances:
    getRnaPred -peptides monDom4 ensGene chrX stdout \
	| faToTab -type=protein stdin stdout | sort > ensGenePep.chrX.fa
    #	And chrX peptides downloaded from ensmart:
    faToTab -type=protein ensPepMonDom4chrX.fasta.gz stdout \
	| sort > ensPepchrX.fa
    #	A diff of those two show they are different.  But this is not
    #	correct yet because they don't even have the same names in them
    #	need to try some other options here to get the same sequence


    #	It turns out it is necessary to have the ensGtp table to enable a link
    #	to Ensemble from a gene's details page

    # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and 
    # hgKnownToSuper.  Use ensMart to create it as above, except:
    # Page 3) Choose the "Features" box. In "Ensembl Attributes", check 
    # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.  
    # Choose Text, tab-separated as the output format.  Result name ensGtp.
    # Save file as monDom4.ensGtp.txt.gz
    gunzip ensGtp.txt.gz
    hgsql monDom4 < ~/kent/src/hg/lib/ensGtp.sql
    # remove header line from ensGtp.txt
    zcat monDom4.ensGtp.txt.gz \
	| hgsql monDom4 \
-e "load data local infile '/dev/stdin' into table ensGtp ignore 1 lines"

    #	Obtaining protein sequence from EnsMart
    #	Select "sequences" from the pull-down on the output page
    #	check Peptide in the "Sequences" selection area
    #	and "Ensembl Transcript ID (versioned) in the Transcript
    #	Attributes area
    #	Uncheck all items in the Gene Attributes section.
    #	Text,Fasta output, gzip, file name: monDom4.ensPep
    #	becomes monDom4.ensPep.fasta.gz
    #	The result should be just an ID and a peptide, e.g.:
# >ENSMODT00000024417.2
# MSEFWLCFNCCIAEQPQPKRRRRIDRSMIGEPTNFVHTAHVGSGDLFSGMNSVSSIQSQMQSKGGYGGGMPSNGQMQLVETKAG
    #
    #	 monDom4.ensPep.fa.gz
    zcat monDom4.ensPep.fa.gz | \
	$HOME/kent/src/utils/faToTab/faToTab.pl  /dev/null /dev/stdin \
	> ensGene.fa.tab
    #	There appear to be more peptides than there were genes before.
    #	I wonder if Ensembl has updated this list since the gene data was
    #	fetched before:
    awk '{print $1}' genePred.tab | sort -u > geneNameList
    awk '{print $1}' ensGene.fa.tab | sort -u > pepNameList
    wc geneNameList pepNameList
    #	33878   33878  711438 geneNameList
    #	33915   33914  712193 pepNameList
    #	67793   67792 1423631 total
    comm -12 geneNameList pepNameList | wc
    #	33878   33878  711438
    #	So create an exclude list to get rid of the extras:
    comm -13 geneNameList pepNameList  > excludeList
    #	and leave them out
    zcat monDom4.ensPep.fa.gz | \
	$HOME/kent/src/utils/faToTab/faToTab.pl excludeList /dev/stdin \
	> ensGene.fa.tab

    hgPepPred monDom4 tab ensPep ensGene.fa.tab

##########################################################################
# N-SCAN gene predictions (nscanGene, nscanPep) - (2006-09-13 markd)
    cd /cluster/data/monDom4/bed/nscan/

    # obtained NSCAN predictions from michael brent's group
    # at WUSTL
    wget -nv -r -l 1 -np --accept=gtf http://ardor.wustl.edu/predictions/possum/monDom4/chr_gtf
    wget -nv -r -l 1 -np --accept=fa http://ardor.wustl.edu/predictions/possum/monDom4/chr_ptx

    # clean up downloaded directorys and compress files.
    mv ardor.wustl.edu/predictions/opossum/monDom4/* .
    rm -rf ardor.wustl.edu
    gzip chr_*/*
    chmod a-w chr_*/*.gz

    # load tracks.  Note that these have *utr features, rather than
    # exon features.  currently ldHgGene creates separate genePred exons
    # for these.
    ldHgGene -bin -gtf -genePredExt monDom4 nscanGene chr_gtf/chr*.gtf.gz

    # load protein, add .a suffix to match transcript id
    hgPepPred -suffix=.a monDom4 generic nscanPep chr_ptx/chr*.fa.gz
    rm *.tab

    # update trackDb; need a monDom4-specific page to describe informants
    opossum/monDom4/nscanGene.html   (copy from rn4 and edit)
    opossum/monDom4/trackDb.ra

    # verify that the search spec matches the naming convention, sometimes
    # transcripts end with .a othertimes .0
    OK!


#########################################################################
# BLASTZ PLATYPUS ornAna1 (DONE 4/3/07 angie)
    ssh pk
    mkdir /cluster/data/monDom4/bed/blastz.ornAna1.2007-04-02
    cd /cluster/data/monDom4/bed/blastz.ornAna1.2007-04-02
    cat << '_EOF_' > DEF
# opossum vs platypus
BLASTZ=blastz.v7.x86_64

# Use "mammal-fish" params even though this is mammal-mammal... 
# pretty distant, hopefully not too many shared undermasked repeats, 
# and if we get overwhelmed we can back off:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Opossum monDom4
SEQ1_DIR=/san/sanvol1/scratch/monDom4/nib
SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Platypus ornAna1
SEQ2_DIR=/san/sanvol1/scratch/ornAna1/ornAna1.2bit
SEQ2_LEN=/san/sanvol1/scratch/ornAna1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0

BASE=/cluster/data/monDom4/bed/blastz.ornAna1.2007-04-02
'_EOF_'
    # << emacs
    doBlastzChainNet.pl DEF -blastzOutRoot /san/sanvol1/scratch/monDom4ornAna1 \
      -bigClusterHub=pk -smallClusterHub=pk \
      -chainMinScore=5000 -chainLinearGap=loose \
      >& do.log & tail -f do.log
    ln -s blastz.ornAna1.2007-04-02 /cluster/data/monDom4/bed/blastz.ornAna1

#########################################################################
# BLASTZ Rhesus rheMac2 (DONE 2007-05-27 braney)
    ssh kolossus
    mkdir /cluster/data/monDom4/bed/blastz.rheMac2
    cd /cluster/data/monDom4/bed/blastz.rheMac2
    cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# settings for more distant organism alignments
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
    
# TARGET: Opossum monDom4
SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Rhesus (rheMac2)
SEQ2_DIR=/san/sanvol1/scratch/rheMac2/nib
SEQ2_LEN=/san/sanvol1/scratch/rheMac2/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/monDom4/bed/blastz.rheMac2
TMPDIR=/scratch/tmp
    # << emacs

    doBlastzChainNet.pl DEF \
      -bigClusterHub=pk  -chainMinScore=5000 -chainLinearGap=loose \
      >& blastz.out 2>&1 &

############################################################################
##  BLASTZ swap from canFam2 alignments (2007-11-10 - markd)
    # /cluster/data/canFam2/bed/blastz.monDom4/DEF1 has incorrect BASE, so
    # create DEF.fixed
    ssh hgwdev
    mkdir /cluster/data/monDom4/bed/blastz.canFam2.swap
    cd /cluster/data/monDom4/bed/blastz.canFam2.swap
    ln -s blastz.canFam2.swap ../blastz.canFam2

    (time /cluster/bin/scripts/doBlastzChainNet.pl \
        -swap /cluster/data/canFam2/bed/blastz.monDom4/DEF.fixed) >& swap.out&

    nice -n +19 featureBits monDom4 chainCanFam2Link
    # 325604035 bases of 3501643220 (9.299%) in intersection

##############################################################################
# Blastz Orangutan ponAbe2 (DONE - 2007-11-26 - Hiram)
    ssh kkstore04
    #	managing disk space on full filesystem
    mkdir /cluster/store8/monDom4/bed/blastzPonAbe2.2007-11-22
    ln -s /cluster/store8/monDom4/bed/blastzPonAbe2.2007-11-22 \
	/cluster/data/monDom4/bed
    cd /cluster/data/monDom4/bed/blastzPonAbe2.2007-11-22

    screen # use screen to control this job
    mkdir /cluster/data/ponAbe2/bed/blastzOrnAna1.2007-11-13
    cd /cluster/data/ponAbe2/bed/blastzOrnAna1.2007-11-13

    cat << '_EOF_' > DEF
# Opossum vs. Orangutan
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_M=50

# TARGET: Opossum MonDom4
SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
SEQ1_LEN=/cluster/data/monDom4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=100

# QUERY: Orangutan PonAbe2
SEQ2_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit
SEQ2_LEN=/cluster/data/ponAbe2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ1_LIMIT=100
SEQ2_LAP=0

BASE=/cluster/data/monDom4/bed/blastzPonAbe2.2007-11-22
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    # XXX - can run Opossum only on pk since the chroms are so large
    time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
	-stop=cat -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
	> cat.log 2>&1 &
    #	real    1600m18.602s
# Completed: 134688 of 134688 jobs
# CPU time in finished jobs:   12106044s  201767.41m  3362.79h  140.12d  0.384 y
# IO & Wait Time:                851946s   14199.09m   236.65h    9.86d  0.027 y
# Average job time:                  96s       1.60m     0.03h    0.00d
# Longest finished job:           11205s     186.75m     3.11h    0.13d
# Submission to last job:         95155s    1585.92m    26.43h    1.10d
    time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
	-continue=chainRun -stop=download -chainLinearGap=loose \
	-bigClusterHub=pk -verbose=2 \
	> chainRun.log 2>&1 &
    #	real    2724m19.530s

    cat fb.monDom4.chainPonAbe2Link.txt
    #	395887393 bases of 3501643220 (11.306%) in intersection

    #	and for the swap
    mkdir /cluster/data/ponAbe2/bed/blastz.monDom4.swap
    cd /cluster/data/ponAbe2/bed/blastz.monDom4.swap
    time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 \
	/cluster/data/monDom4/bed/blastzPonAbe2.2007-11-22/DEF \
	-swap -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
	> swap.log 2>&1 &
    #	real    223m38.325s
    cat fb.ponAbe2.chainMonDom4Link.txt
    #	406409435 bases of 3093572278 (13.137%) in intersection

##############################################################################
# Blastz Marmoset calJac1 (WORKING - 2007-11-24 - Hiram)
    ssh kkstore04
    screen # use screen to control this job
    #	managing disk space on full filesystem
    mkdir /cluster/store8/monDom4/bed/blastzCalJac1.2007-11-24
    ln -s /cluster/store8/monDom4/bed/blastzCalJac1.2007-11-24 \
	/cluster/data/monDom4/bed
    cd /cluster/data/monDom4/bed/blastzCalJac1.2007-11-24

    #	this chunking makes 651,480 jobs
    cat << '_EOF_' > DEF
# Opossum vs. Marmoset
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_M=50

# TARGET: Opossum MonDom4
SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
SEQ1_LEN=/cluster/data/monDom4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=100

# QUERY: Marmoset calJac1, largest contig 2,551,648, 49,724 contigs
SEQ2_DIR=/cluster/bluearc/scratch/data/calJac1/calJac1.2bit
SEQ2_LEN=/cluster/data/calJac1/chrom.sizes
SEQ2_CHUNK=2000000
SEQ2_LIMIT=200
SEQ2_LAP=0

BASE=/cluster/data/monDom4/bed/blastzCalJac1.2007-11-24
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    # XXX - can run Opossum only on pk since the chroms are so large
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=5000 \
	-stop=cat -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
	> cat.log 2>&1 &
    #	real 2581m49.853s
# Completed: 651480 of 651480 jobs
# CPU time in finished jobs:   25871528s  431192.14m  7186.54h  299.44d  0.820 y
# IO & Wait Time:               2695828s   44930.46m   748.84h   31.20d  0.085 y
# Average job time:                  44s       0.73m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            1804s      30.07m     0.50h    0.02d
# Submission to last job:        146641s    2444.02m    40.73h    1.70d
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=5000 \
	-continue=chainRun -stop=download \
	-chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
	> chainRun.log 2>&1 &
    #	real    1810m38.601s
    # the 10 chaining jobs on kki take about 35 hours for the longest job
# Completed: 10 of 10 jobs
# CPU time in finished jobs:     369079s    6151.32m   102.52h    4.27d  0.012 y
# IO & Wait Time:                   385s       6.41m     0.11h    0.00d  0.000 y
# Average job time:               36946s     615.77m    10.26h    0.43d
# Longest finished job:          103069s    1717.82m    28.63h    1.19d
# Submission to last job:        103069s    1717.82m    28.63h    1.19d

    cat fb.monDom4.chainCalJac1Link.txt
    #	386915334 bases of 3501643220 (11.050%) in intersection

    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=5000 \
	-continue=cleanup \
	-chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
	> cleanup.log 2>&1 &

    mkdir /cluster/data/calJac1/bed/blastz.monDom4.swap
    cd /cluster/data/calJac1/bed/blastz.monDom4.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/monDom4/bed/blastzCalJac1.2007-11-24/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-swap -bigClusterHub=pk > swap.log 2>&1 &
    #	real    498m24.344s
    #	this failed during the load because the chainLink table became
    #	too large.  These were loaded manually with an sql statement to
    #	start the table definition, then a load data local infile using
    #	the .tab files left over from the failed load.  Note the extra
    #	definitions on the chainMonDom4Link table
CREATE TABLE chainMonDom4 (
  bin smallint(5) unsigned NOT NULL default '0',
    score double not null,	# score of chain
    tName varchar(255) not null,	# Target sequence name
    tSize int unsigned not null,	# Target sequence size
    tStart int unsigned not null,	# Alignment start position in target
    tEnd int unsigned not null,	# Alignment end position in target
    qName varchar(255) not null,	# Query sequence name
    qSize int unsigned not null,	# Query sequence size
    qStrand char(1) not null,	# Query strand
    qStart int unsigned not null,	# Alignment start position in query
    qEnd int unsigned not null,	# Alignment end position in query
    id int unsigned not null,	# chain id
              #Indices
  KEY tName (tName(16),bin),
  KEY id (id)
) TYPE=MyISAM;

    time nice -n +19 hgsql -e \
	"load data local infile \"chain.tab\" into table chainMonDom4;" calJac1
    #	real    8m10.273s

CREATE TABLE chainMonDom4Link (
  bin smallint(5) unsigned NOT NULL default 0,
  tName varchar(255) NOT NULL default '',
  tStart int(10) unsigned NOT NULL default 0,
  tEnd int(10) unsigned NOT NULL default 0,
  qStart int(10) unsigned NOT NULL default 0,
  chainId int(10) unsigned NOT NULL default 0,
  KEY tName (tName(16),bin),
  KEY chainId (chainId)
) ENGINE=MyISAM max_rows=160000000 avg_row_length=55 pack_keys=1 CHARSET=latin1;

    time nice -n +19 hgsql -e \
      "load data local infile \"link.tab\" into table chainMonDom4Link;" calJac1
    #	this one took a number of hours

    #	finish the nets and load
    time nice -n +19 netClass -verbose=0 -noAr noClass.net \
	calJac1 monDom4 calJac1.monDom4.net
    #	real    4m18.898s
    time nice -n +19 netFilter -minGap=10 calJac1.monDom4.net \
	| hgLoadNet -verbose=0 calJac1 netMonDom4 stdin
    #	real    0m54.559s
    cd /cluster/data/calJac1/bed/blastz.monDom4.swap
    time nice -n +19 featureBits calJac1 chainMonDom4Link \
	> fb.calJac1.chainMonDom4Link.txt 2>&1
    #	real    25m7.373s
    cat fb.calJac1.chainMonDom4Link.txt
    #	391248654 bases of 2929139385 (13.357%) in intersection

    #	continuing
    cd /cluster/data/calJac1/bed/blastz.monDom4.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/monDom4/bed/blastzCalJac1.2007-11-24/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-continue=download -swap -bigClusterHub=pk > download.log 2>&1 &

###########################################################################
############################################################################
# TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20

see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build  (2008-06-30 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30

see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

see doc/builds.txt for specific details.
############################################################################
