# for emacs: -*- mode: sh; -*-


#	Creating the assembly for Monodelphis domestica
#	South American, Short-tailed Opossum
#	http://www.genome.gov/11510687
#	http://www.genome.gov/12512285

# DOWNLOAD SEQUENCE (DONE - 2004-12-01 - Hiram)

    ssh kksilo

    mkdir -p /cluster/store8/monDom1/broad.mit.edu
    ln -s /cluster/store8/monDom1 /cluster/data/monDom1
    cd /cluster/data/monDom1/broad.mit.edu
    #	Set user name and password in your $HOME/.netrc
    wget --timestamping "ftp://ftp.broad.mit.edu/*"
    #	takes a couple hours
    #	What we have:
    ls -ogrt
    #	total 4071636
    #	-rw-rw-r--    1        3992 Oct 18 16:45 assembly.format
    #	-rw-rw-r--    1  1196876365 Oct 18 16:52 contigs.bases.gz
    #	-rw-rw-r--    1   539154819 Oct 18 17:24 contigs.quals.gz
    #	-rwxrwxr-x    1     4418102 Oct 18 17:52 supercontigs.summary
    #	-rwxrwxrwx    1     3910827 Oct 18 17:52 supercontigs
    #	-rw-rw-r--    1    23299859 Oct 18 17:52 reads.unplaced.gz
    #	-rw-rw-r--    1   645129395 Oct 18 17:52 reads.placed.gz
    #	-rw-rw-r--    1    10757748 Oct 18 17:52 assembly.agp
    #	-rw-rw-r--    1  1199683639 Oct 19 17:15 assembly.agp.fasta.gz
    #	-rw-rw-r--    1   541975023 Oct 19 17:34 assembly.agp.qual.gz
    time gunzip *.gz
    #	That took almost 2 hours on a busy kksilo
    ls -ogrt
    #	total 29983148
    #	-rw-rw-r--    1        3992 Oct 18 16:45 assembly.format
    #	-rw-rw-r--    1  3537338236 Oct 18 16:52 contigs.bases
    #	-rw-rw-r--    1 10472323694 Oct 18 17:24 contigs.quals
    #	-rwxrwxr-x    1     4418102 Oct 18 17:52 supercontigs.summary
    #	-rwxrwxrwx    1     3910827 Oct 18 17:52 supercontigs
    #	-rw-rw-r--    1   146964359 Oct 18 17:52 reads.unplaced
    #	-rw-rw-r--    1  2273830873 Oct 18 17:52 reads.placed
    #	-rw-rw-r--    1    10757748 Oct 18 17:52 assembly.agp
    #	-rw-rw-r--    1  3608989613 Oct 19 17:15 assembly.agp.fasta
    #	-rw-rw-r--    1 10614160078 Oct 19 17:34 assembly.agp.qual

# DATA INTEGRITY VERIFICATION (DONE - Hiram - 2004-12-02)
    ssh kksilo
    mkdir /cluster/data/monDom1/dataCheck
    cd /cluster/data/monDom1/dataCheck
    grep "^>" ../broad.mit.edu/contigs.bases | sed -e "s/^>//" > contigs.names
    #	2 minutes
    #	Check for duplicates, should be same line count
    sort  contigs.names | wc
    #	109065  109065 1415800
    sort -u  contigs.names | wc
    #	109065  109065 1415800

    sort contigs.names | head -3
    #	contig_0
    #	contig_1
    #	contig_10
    sort contigs.names | tail -3
    #	contig_99997
    #	contig_99998
    #	contig_99999
    wc contigs.names
    #	109065  109065 1415800 contigs.names
    grep "^>" ../broad.mit.edu/contigs.quals \
	    | sed -e "s/^>//" > contigs.quals.names
    #	6 minutes
    sum -r contigs*
    #	35350  1383 contigs.names
    #	35350  1383 contigs.quals.names
    awk '{print $6}' ../broad.mit.edu/assembly.agp \
	    | grep contig_ > assembly.agp.contig.names
    #	The assembly agp contig list appears to be the same contig list
    sort -u  contigs.names | sum -r
    #	18868  1383
    sort -u  assembly.agp.contig.names | sum -r
    #	18868  1383
    awk '{print $1}' ../broad.mit.edu/assembly.agp \
	    | sort -u > assembly.agp.scaffold.names
    wc assembly.agp.scaffold.names
    #	19348   19348  279110 assembly.agp.scaffold.names
    head -3 assembly.agp.scaffold.names
    #	scaffold_0
    #	scaffold_1
    #	scaffold_10
    tail -3 assembly.agp.scaffold.names
    #	scaffold_9997
    #	scaffold_9998
    #	scaffold_9999
    grep "^>" ../broad.mit.edu/assembly.agp.fasta | \
	sed -e "s/^>//; s/\..*//" > assembly.fasta.scaffold.names
    #	2 minutes
    #	There are some duplicates in this fasta file:
    sort -u  assembly.fasta.scaffold.names | wc
    #	19348   19348  279110
    wc  assembly.fasta.scaffold.names 
    #	19576   19576  282523 assembly.fasta.scaffold.names
    #	For example:
    grep "^>scaffold_13303" ../broad.mit.edu/assembly.agp.fasta
    #	>scaffold_13303.1-5000000 (MonodelphisPreliminaryAssemblyOct04)
    #	>scaffold_13303.5000001-10000000 (MonodelphisPreliminaryAssemblyOct04)
    #	>scaffold_13303.10000001-15000000 (MonodelphisPreliminaryAssemblyOct04)
    #	>scaffold_13303.15000001-20000000 (MonodelphisPreliminaryAssemblyOct04)
    #	>scaffold_13303.20000001-22286839 (MonodelphisPreliminaryAssemblyOct04)
    faCount ../broad.mit.edu/assembly.agp.fasta > faCount.assembly.agp.fasta
    #   seq          len               A              C               G
    # total   3563247205      1085390300      660684418       660336426
    #              T             N             cpg
    #     1085697086      71138975        16821709
    grep scaffold_ faCount.assembly.agp.fasta > scaffolds.faCount
    #	Taking a look at those counts, the smallest scaffold is 1000
    #	bases.  The largest ones are broken into 5,000,000 base chunks
    #	as evidenced by the duplicate example above.
    faCount ../broad.mit.edu/contigs.bases > faCount.contigs
    #   seq          len               A              C               G
    # total   3492108230      1085390300      660684418       660336426
    #              T             N             cpg
    #     1085697086             0        16821709
    grep contig_ faCount.contigs > contigs.faCount
    #	The contigs range in size from 85 bases to 658,080
    #	The bulk of them start at a size of 1000

# COMBINE scaffold fasta pieces into single chunks, remove the extra
#	base-range info from the names. (DONE - Hiram - 2004-12-02)
    ssh kksilo
    cd /cluster/data/monDom1/broad.mit.edu
    cat << '_EOF_' collapseScaffolds.sh
#!/bin/sh
#
awk '
BEGIN { name="" }
/^>/ {
    id=$1
    sub(">","",id)
    sub("\\..*","",id)
    if (!match(name,id)) {
        printf ">%s\n", id
        name=id
        }
}
/^[^>]/ { print }
' assembly.agp.fasta
'_EOF_'
    # << this line keeps emacs coloring happy

    chmod +x collapseScaffolds.sh
    time ./collapseScaffolds.sh > scaffolds.fa
    #	11 minutes
    # make sure we have not damaged the sequence:
    time faCount scaffolds.fa | tail -1
    #	4 minutes
    # total   3563247205      1085390300      660684418       660336426
    #      1085697086      71138975        16821709
    #	Those are the same numbers as above

# MAKE 2BIT NIB FILE (DONE - Hiram - 2004-12-02)
    ssh kksilo
    cd /cluster/data/monDom1
    faToTwoBit broad.mit.edu/scaffolds.fa monDom1.2bit
    #	check that the sequence hasn't been damaged
    twoBitToFa monDom1.2bit stdout | faCount stdin | tail -1
    # total   3563247205      1085390300      660684418       660336426
    #      1085697086      71138975        16821709
    #	still the same numbers

# CREATE DATABASE (DONE - Hiram - 2004-12-02)

    ssh hgwdev
    cd /cluster/data/monDom1
    mkdir /gbdb/monDom1
    ln -s /cluster/data/monDom1/monDom1.2bit /gbdb/monDom1
    twoBitInfo monDom1.2bit stdout |
	awk '{printf "%s\t%s\t/gbdb/monDom1/monDom1.2bit\n", $1,$2}' \
	> chromInfo.tab

    hgsql -e "create database monDom1;" mysql
    hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp;" \
		monDom1
    hgsql monDom1 < $HOME/kent/src/hg/lib/chromInfo.sql
    hgsql -e 'load data local infile "chromInfo.tab" into table chromInfo;' \
	monDom1
    #	generate chrom.sizes list in order by size, biggest first
    #	This listing is going to be used in a variety of procedures below
    twoBitInfo monDom1.2bit stdout | sort -rn +1 > chrom.sizes
    hgsql monDom1 -N -e 'select chrom from chromInfo' > chrom.lst

    # Enter monDom1 into dbDb and defaultDb so test browser knows about it:
    hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
	defaultPos, active, orderKey, genome, scientificName, \
	htmlPath, hgNearOk, hgPbOk, sourceName) \
	VALUES("monDom1", "Oct 2004", "/gbdb/monDom1", "M. domestica", \
	"scaffold_13303:1000000-11000000", 1, 33, "Opossum", \
	"Monodelphis domestica", \
	"/gbdb/monDom1/html/description.html", 0, 0, \
	"Broad Inst. Prelim Oct04");' \
	-h localhost hgcentraltest
    hgsql -e 'INSERT INTO defaultDb (name, genome) \
	VALUES("monDom1", "Opossum")' \
        -h localhost hgcentraltest

    # Make trackDb table so browser knows what tracks to expect (DONE - Hiram)
    cd $HOME/kent/src/hg/makeDb/trackDb
    mkdir /cluster/data/monDom1/html
    ln -s /cluster/data/monDom1/html /gbdb/monDom1/html 
    mkdir opossum
    cvs add opossum
    cd opossum
    mkdir monDom1
    cvs add monDom1
    cd monDom1
    cat << '_EOF_' > description.html
<HTML><HEAD> <TITLE> Monodelphis domestica </TITLE> </HEAD><BODY>

<H1> Description page to be done for Monodelphis domestica </H1>

</BODY></HTML>
'_EOF_'
    # << this line keeps emacs coloring happy
    cvs add description.html
    cvs commit
    cd ../..
    make DBS="monDom1" ZOO_DBS=""


#  CHUNK up scaffolds for repeat masking (DONE - Hiram - 2004-12-03)
#	See also: comments about this in: makeApiMel1.doc
    ssh kksilo
    cd /cluster/data/monDom1
    mkdir -p jkStuff
    faSplit size broad.mit.edu/scaffolds.fa 500000 -oneFile \
	  scaffoldsSplit -lift=jkStuff/scaffoldsSplit.lft

    time faSplit about scaffoldsSplit.fa 200000 chunks500k/chunk_
    #	about 9 minutes for each of those two faSplits
    mkdir -p /cluster/bluearc/monDom1
    #	copy to bluearc for cluster run
    cp -Rp ./chunks500k /cluster/bluearc/monDom1
    #	this business in /cluster/bluearc/monDom1/ was later removed
    #	and to iservers:
    ssh kkr1u00
    mkdir /iscratch/i/monDom1
    cd /iscratch/i/monDom1
    cp -Rp /cluster/bluearc/monDom1/chunks500k .
    /cluster/bin/iSync

# LOAD GAP & GOLD TABLES FROM AGP (DONE - 2004-12-03 - Hiram)
    ssh hgwdev
    cd /cluster/data/monDom1
    hgGoldGapGl -noGl monDom1 broad.mit.edu/assembly.agp
    #	Verify sanity of indexes
    hgsql -e "show index from gold;" monDom1
    hgsql -e "show index from gap;" monDom1
    #	Look at the Cardinality column, it should have some healthy
    #	numbers for all indices
    # For some reason, the indices do not get built correctly --
    # "show index from gap/gold" shows NULL cardinalities for chrom.  
    # Rebuild indices with "analyze table".
    hgsql -e "analyze table gold; analyze table gap;" monDom1


# RUN REPEAT MASKER (DONE - 2004-12-06 - Hiram)
    #	Received a library from Michele Clamp who says Damian Keefe
    #	at Ensembl created to use in addition to the generic mammal library.
    #	Will run both -mammal and this one, and combine the results.
    #	have placed the library in monDom1/jkStuff/monDom1.novel.RM.lib
    #	DIFFERENT fileserver, new disk space
    ssh kksilo
    mkdir /cluster/store1/monDom1
    mkdir /cluster/store1/monDom1/RMOut
    ln -s /cluster/store1/monDom1/RMOut /cluster/store8/monDom1
    ln -s /cluster/store8/monDom1/chunks500k /cluster/store1/monDom1
    ln -s /cluster/store8/monDom1/jkStuff /cluster/store1/monDom1
    #	
    cd /cluster/data/monDom1
    cat << '_EOF_' > jkStuff/RMOpossum
#!/bin/csh -fe

cd $1
/bin/mkdir -p /tmp/monDom1/$2
/bin/cp ../chunks500k/$2 /tmp/monDom1/$2/
pushd /tmp/monDom1/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -s -lib /iscratch/i/monDom1/monDom1.novel.RM.lib $2
mv $2.out $2.novel.out
/cluster/bluearc/RepeatMasker/RepeatMasker -s -species "Monodelphis domestica" $2
tail +4 $2.novel.out >> $2.out
popd
/bin/cp -p /tmp/monDom1/$2/$2.out ./
/bin/rm -fr /tmp/monDom1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/monDom1/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/monDom1
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod +x jkStuff/RMOpossum
    mkdir RMRun
    bash	#	if you are csh
    ls chunks500k | while read chunk
    do
	echo ../jkStuff/RMOpossum /cluster/data/monDom1/RMOut $chunk \
           '{'check in line+ /iscratch/i/monDom1/chunks500k/$chunk'}' \
         '{'check out line+ /cluster/data/monDom1/RMOut/$chunk.out'}'
    done > RMRun/RMJobs
    exit	#	out of bash back to csh

    # do the run
    ssh kk
    mkdir /cluster/data/monDom1/RMRun
    cd /cluster/data/monDom1/RMRun
    para create RMJobs
    para try, check, push, check,...

    #	This takes extra long compared to normal because of the two
    #	passes being run on the data
Completed: 7627 of 7627 jobs
CPU time in finished jobs:  117842813s 1964046.88m 32734.11h 1363.92d  3.737 y
IO & Wait Time:                704870s   11747.84m   195.80h    8.16d  0.022 y
Average job time:               15543s     259.05m     4.32h    0.18d
Longest job:                    24045s     400.75m     6.68h    0.28d
Submission to last job:        185727s    3095.45m    51.59h    2.15d

    # Lift up the split-scaffold .out's to scaffold .out's
    ssh eieio
    cd /cluster/store1/monDom1
    mkdir RMOutLifted
    ln -s `pwd`/RMOutLifted /cluster/data/monDom1
    bash		# if you are csh
    find ./RMOut -type f | while read F
    do
        B=${F/.\/RMOut\//}
        echo "$F -> RMOutLifted/$B"
        liftUp RMOutLifted/$B jkStuff/scaffoldsSplit.lft warn $F
    done
    exit		# the bash session if your are csh

    # Cat all those lifted .out files together into one single file
    head -3 RMOutLifted/chunk_00.fa.out > RMRun/scaffolds.fa.RM.out
    find ./RMOutLifted -type f | while read f
    do
	tail +4 $f >> RMRun/scaffolds.fa.RM.out
    done
    #	I tried the following sort in the above loop to see if
    #	the sort mentioned below could be avoided:
    #	tail +4 $f | sort -u >> scaffolds.unique.fa.RM.out
    #	but this sort was not thorough enough.  It ended up with more
    #	records loaded into the database, but with the same amount
    #	of featureBits coverage.

    #	Eliminate the duplicates from the double RepeatMasker run.
    #	When run twice, it finds the Simple and Low Complexity repeats
    #	twice.
    hgLoadOut -tabFile=hgLoadOut.tab monDom1 -nosplit scaffolds.fa.RM.out 
    #	A few "Strange perc. field" outputs, which always happens, but
    #	not too many.  e.g.:
    #	Strange perc. field -1.4 line 1939539 of scaffolds.fa.RM.out
    #	Strange perc. field -2.5 line 3078381 of scaffolds.fa.RM.out
    #	Strange perc. field -2.1 line 3078381 of scaffolds.fa.RM.out
    #	Strange perc. field -40.9 line 3442991 of scaffolds.fa.RM.out
    #	Strange perc. field -3.9 line 3442991 of scaffolds.fa.RM.out
    #	... etc ...
    #	note: 3 records dropped due to repStart > repEnd
    ls -og scaffolds.fa.RM.out hgLoadOut.tab
    #	-rw-rw-r--  1 2553485808 Dec  6 00:05 scaffolds.fa.RM.out
    #	-rw-rw-r--  1 1935604506 Dec  6 00:16 hgLoadOut.tab

    sort -k 6,6 -k7n,7n hgLoadOut.tab | uniq > hgLoadOutUniq.tab 
    #	takes almost an hour

    ssh hgwdev
    cd /cluster/store1/monDom1
    #	Get the structure of the table started
    hgLoadOut -nosplit -table=rmsk monDom1 RMOutLifted/chunk_1000.fa.out
    #	Now, reload it fully with the unique set
    hgsql -e 'delete from rmsk;' monDom1
    hgsql -e 'load data local infile "hgLoadOutUniq.tab" into table rmsk;' \
	monDom1
    #	This load takes an hour, 12 minutes for a featureBits:
    featureBits monDom1 rmsk
    #	1775646140 bases of 3492108230 (50.847%) in intersection
    featureBits hg17 rmsk
    #	1390952984 bases of 2866216770 (48.529%) in intersection
    featureBits canFam1 rmsk
    #	896773874 bases of 2359845093 (38.001%) in intersection
    featureBits mm5 rmsk
    #	1137310280 bases of 2615483787 (43.484%) in intersection
    featureBits rn3 rmsk
    #	1117483165 bases of 2571104688 (43.463%) in intersection
    featureBits panTro1 rmsk
    #	1311281819 bases of 2733948177 (47.963%) in intersection


    #	Verify the indexes are correct
    hgsql -e "show index from rmsk;" monDom1
    #	Should be a good cardinality count for each one
    #	It does not, it shows None for genoName (genoName(14), bin)
    #	and 18744904 for both (genomeName(14), genoStart) and (...genoEnd)
    hgsql monDom1 -e 'drop index genoName       on rmsk; \
                  drop index genoName_2 on rmsk; \
                  drop index genoName_3   on rmsk; \
                  create index bin       on rmsk (genoName(14), bin); \
                  create index genoStart on rmsk (genoName(14), genoStart); \
                  create index genoEnd   on rmsk (genoName(14), genoEnd);'
    #	This index creation took 76 minutes
    #	After that, the "bin" index has a cardinality of 48,561
    #	genoStart and genoEnd are still at 18,744,904

# SIMPLE REPEAT [TRF] TRACK (DONE - 2004-12-03 - Hiram)
    #	Run this on the rack 9 cluster
    ssh kk9
    mkdir /cluster/data/monDom1/bed/simpleRepeat
    cd /cluster/data/monDom1/bed/simpleRepeat
    mkdir trf
    cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x runTrf

    cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)}  {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    ls -1S /iscratch/i/monDom1/chunks500k | sed -e "s#^#/iscratch/i/monDom1/chunks500k/#" > genome.lst
    gensub2 genome.lst single gsub jobList
    para create jobList
    para try
    para check
    para push
Completed: 7627 of 7627 jobs
CPU time in finished jobs:      24845s     414.08m     6.90h    0.29d  0.001 y
IO & Wait Time:                 19407s     323.45m     5.39h    0.22d  0.001 y
Average job time:                   6s       0.10m     0.00h    0.00d
Longest job:                       49s       0.82m     0.01h    0.00d
Submission to last job:          1309s      21.82m     0.36h    0.02d


    find ./trf -type f | xargs cat | \
	liftUp simpleRepeat.bed \
	/cluster/data/monDom1/jkStuff/scaffoldsSplit.lft \
	warn stdin  > lu.out 2>&1

    # Load into the database:
    ssh hgwdev
    cd /cluster/data/monDom1/bed/simpleRepeat
    hgLoadBed monDom1 simpleRepeat simpleRepeat.bed \
	-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    #	Loaded 697668 elements of size 16
    #	Compare with some other assemblies
    featureBits monDom1 simpleRepeat
    #	55000238 bases of 3492108230 (1.575%) in intersection
    featureBits rn3 simpleRepeat
    #	70073656 bases of 2571104688 (2.725%) in intersection
    featureBits mm5 simpleRepeat
    #	81414259 bases of 2615483787 (3.113%) in intersection
    featureBits monDom1 simpleRepeat
    #	54952425 bases of 2866216770 (1.917%) in intersection

# CREATE MICROSAT TRACK (done 2006-7-5 JK)
     ssh hgwdev
     cd /cluster/data/monDom1/bed
     mkdir microsat
     cd microsat
     awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed 
    /cluster/bin/i386/hgLoadBed monDom1 microsat microsat.bed

# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2004-12-06 - Hiram)
    # After the simpleRepeats track has been built, make a filtered version 
    # of the trf output: keep trf's with period <= 12:
    ssh eieio
    cd /cluster/data/monDom1/bed/simpleRepeat
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed

# MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE - 2004-12-06 - Hiram)
    #	maskOutFa is trying to read in the entire input sequence to get
    #	it ready for masking.  This doesn't work on the ordinary memory
    #	machines since scaffolds.fa is the whole thing.
    ssh kolossus
    mkdir /scratch/monDom1
    cd /scratch/monDom1
    cp -p /cluster/store8/monDom1/broad.mit.edu/scaffolds.fa .
    cp -p /cluster/store1/monDom1/bed/simpleRepeat/trfMask.bed .
    cp -p /cluster/store1/monDom1/RMRun/scaffolds.fa.RM.out .

    maskOutFa -soft scaffolds.fa trfMask.bed maskedScaffolds.fa
    maskOutFa -clip -softAdd maskedScaffolds.fa scaffolds.fa.RM.out \
	maskedScaffolds.fa
    #	those two mask operations each take 3 to 4 minutes.
    #	There were a number of negative rEnd warnings on scaffold_13261,
    #	all on type L1Md_T, one warning each on
    #	scaffold_13244, scaffold_13338 for type L1M4c,
    #	and one on scaffold_13498 for type monodelphis_domestica_23_1
    #	and one warning:
    #	Mask record out of range line 6686033 of scaffolds.fa.RM.out
    #	scaffold_13485 size is 3678616,  range is 3678568-3678618
    #	This last error makes it stop without doing anything.  Need to
    #	fix this.  Use the -clip option.  That helps it past that and
    #	now more warnings are seen, on scaffold_16808 for type Lx5,
    #	and one warning on scaffold_14879 for type monodelphis_domestica_48

    #	May as well create a 2bit while we are here:
    time faToTwoBit maskedScaffolds.fa masked.2bit
    #	3 minutes
    #	Then, check it:
    twoBitToFa masked.2bit stdout | faCount stdin | tail -1
    #	4 minutes, seems to have lost 2A's and 1 C:
    # total   3563247202      1085390298      660684418       660336425
    #      1085697086      71138975        16821709
    #	previously:
    # total   3563247205      1085390300      660684418       660336426
    #      1085697086      71138975        16821709
    #	I went back through this sequence and could not duplicate this
    #	failure of missing three bases ...

    #	Copy the masked 2bit file back to kksilo
    ssh kksilo
    cd /cluster/data/monDom1
    scp -p kolossus:/scratch/hiram/monDom1/masked.2bit .
    rm monDom1.2bit
    mv masked.2bit monDom1.2bit
    #	Ask for the blat servers to be restarted with this new masked
    #	sequence
    # Now clean up the unmasked split scaffolds to avoid confusion later.
    rm -r chunks500k scaffoldsSplit.fa jkStuff/scaffoldsSplit.lft

# MAKE 11.OOC FILE FOR BLAT (TBD)
    ssh kolossus
    cd /cluster/bluearc/monDom1
    #	use repMatch of 1228 as this genome is ~ %20 larger than human
    #	1024 + (1024 * 0.2) = 1228
    blat /cluster/data/monDom1/monDom1.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1228
    #	Wrote 43126 overused 11-mers to 11.ooc
    #	-rw-rw-r--    1   172512 Dec  7 17:22 11.ooc
    ssh kkr1u00
    cd /iscratch/i/monDom1
    cp -p /cluster/bluearc/monDom1/11.ooc .
     /cluster/bin/iSync

    #	at repMatch=1024
    #	Wrote 61187 overused 11-mers to 11.ooc
    #	at repMatch=540
    #	Wrote 207412 overused 11-mers to 11.ooc
    #	For comparison, Hg17 has:
    #	Wrote 30077 overused 11-mers to 11.ooc

# PUT SEQUENCE ON SCRATCH FOR BLASTZ (DONE - 2004-12-07 - Hiram)
    #	These chunks do not need to correspond to the 500,000 chunks
    #	used in the repeat mask run because we are not going to use the
    #	repeat mask .out files at lineage specific repeats.

    ssh kksilo
    cd /tmp
    #	Make sure you have enough room for this: (~ 4 Gb)
    df -h .
    #	Filesystem            Size  Used Avail Use% Mounted on
    #	/dev/sda2              32G   23G  8.0G  74% /

    #	split scaffolds that are over 2,000,000
    faSplit size /cluster/data/monDom1/maskedScaffolds.fa 2000000 -oneFile \
	scaf_ \
	-lift=/cluster/data/monDom1/jkStuff/chunkedScaffolds.lft

    mkdir /cluster/bluearc/scratch/monDom1
    cd /cluster/bluearc/scratch/monDom1
    mkdir chunks
    #	Collect the split scaffolds into reasonably sized chunks
    #	This produces over 5000 files of size 500,000 
    #	a few smaller, a few larger, 7627 total files
    faSplit about /tmp/scaf_.fa 10000000 chunks/chunk_
    #	done with this temp file
    rm /tmp/scaf_.fa
    #	a list for convenience
    ls -1S chunks > chunk.list
    cp -p /cluster/data/monDom1/jkStuff/chunkedScaffolds.lft .
    cp -p /cluster/data/monDom1/monDom1.2bit .

    #	And create a chunks.2bit for the lavToAxt operations
    ssh kkr1u00
    cd /iscratch/i/monDom1/chunks
    faToTwoBit chunks*.fa ../chunks.2bit
    cd ..
    twoBitInfo chunks.2bit chunks.info
    /cluster/bin/iSync
    #	request rsync of scratch to the cluster

# PRODUCING GENSCAN PREDICTIONS (DONE - 2004-12-07 - Hiram)
# PRODUCING GENSCAN PREDICTIONS (REDONE - 2005-01-07 - Angie  
#                                          note added by kuhn)
# genscanPep table touched by alisq to satisfy joinerCheck 2005-03-32 - kuhn)

    ssh kolossus
    cd /scratch/monDom1
    time maskOutFa maskedScaffolds.fa hard hardMaskedScaffolds.fa
    #	Clean out the /cluster/bluearc/monDom1/ files that were used
    #	during repeat masking, then, this copy
    cp -p hardMaskedScaffolds.fa /cluster/bluearc/monDom1
    cd /cluster/bluearc/monDom1
    mkdir hardChunks
    faSplit about hardMaskedScaffolds.fa 200000 hardChunks/chunk_
    #	This leaves some rather large pieces, will have to see if
    #	genscan can handle it.  (2086 files)

    ssh hgwdev
    mkdir /cluster/data/monDom1/bed/genscan
    cd /cluster/data/monDom1/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux
    # Make 3 subdirectories for genscan to place output files
    mkdir gtf pep subopt

    ssh kk9
    cd /cluster/data/monDom1/bed/genscan
    ls -1S /cluster/bluearc/monDom1/hardChunks/chunk*.fa > chunks.list
    cat << '_EOF_' > gsub
#LOOP
gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    gensub2 chunks.list single gsub jobList
    para create jobList
    para try, check, push, check, ...
# Completed: 2086 of 2086 jobs
# CPU time in finished jobs:     130316s    2171.94m    36.20h    1.51d  0.004 y
# IO & Wait Time:                 17821s     297.01m     4.95h    0.21d  0.001 y
# Average job time:                  71s       1.18m     0.02h    0.00d
# Longest job:                     2509s      41.82m     0.70h    0.03d
# Submission to last job:          2573s      42.88m     0.71h    0.03d

    # Concatenate scaffold-level results:
    ssh eieio
    cd /cluster/data/monDom1/bed/genscan
    cat gtf/*.gtf > genscan.gtf
    cat subopt/*.bed > genscanSubopt.bed
    cat pep/*.pep > genscan.pep



    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/monDom1/bed/genscan
    # Reloaded without -genePredExt 1/6/05:
    ldHgGene -gtf monDom1 genscan genscan.gtf
    hgPepPred monDom1 generic genscanPep genscan.pep
    hgLoadBed monDom1 genscanSubopt genscanSubopt.bed

    #	41090 sequences added
    #	Compare with other assemblies:
    featureBits monDom1 genscan
    #	47170795 bases of 3492108230 (1.351%) in intersection
    featureBits monDom1 genscanSubopt
    #	51805835 bases of 3492108230 (1.484%) in intersection
    featureBits hg17 genscan
    #	55323340 bases of 2866216770 (1.930%) in intersection
    featureBits hg17 genscanSubopt
    #	55986178 bases of 2866216770 (1.953%) in intersection

    # Clean up
    #	Save a copy of the hardMask for later .zip prep:
    ssh kksilo
    cp -p /cluster/bluearc/monDom1/hardMaskedScaffolds.fa .
    ssh eieio
    cd /cluster/bluearc/monDom1
    rm -fr hardChunks hardMaskedScaffolds.fa
    #	This kolossus clean up was actually left until much later.
    #	I found these copies here to be useful later on.
    ssh kolossus
    cd /scratch/monDom1
    rm -f *

# BLAT SERVER SETUP (DONE - 2004-12-07 - Hiram)

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES("monDom1", "blat14", 17786, 1, 0);' \
	-h localhost hgcentraltest
    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES("monDom1", "blat14", 17787, 0, 0);' \
	-h localhost hgcentraltest

    # Make trackDb table so browser knows what tracks to expect (DONE - Hiram)

# AUTO UPDATE GENBANK MRNA RUN  (DONE - 2004-12-15 - Hiram)
    ssh hgwdev
    #	In the source tree, since this is a new organism, go to:
    #	kent/src/hg/makeDb/genbank/src/lib
    #	And edit gbGenome.c to add monDom in two places for two lists:
    #	static char *monDomNames[] = {"Monodelphis domestica", NULL};
    #	{"monDom", monDomNames, NULL},
    #	Genbank species list is kept in:
    #	/cluster/data/genbank/data/species.lst
    #	This one is already on there.
    #	And add an entry for the ooc file in:
    #	hg/makeDb/genbank/src/align/gbBlat
    #	In a clean source tree, go to hg/makeDb/genbank
    #	and run a 'make install-server'

    cd /cluster/data/genbank
    # This is a new organism, edit the etc/genbank.conf file and add:
	# monDom1
	monDom1.genome = /scratch/monDom1/monDom1.2bit
	monDom1.refseq.mrna.native.load = no
	monDom1.mondoTwoBitParts = 7000
	monDom1.downloadDir = monDom1
	monDom1.lift = no
	monDom1.perChromTables = no

    #	Do the refseq's first, they are the quick ones
    ssh eieio
    cd /cluster/data/genbank
    nice bin/gbAlignStep -srcDb=refseq -type=mrna -verbose=1 -initial monDom1
    #	var/build/logs/2004.12.08-19:09:59.monDom1.initalign.log
    #	it said: "nothing to align" - i.e. there are no native refseq
    #	mRNA's for this organism, add the line:
    #	monDom1.refseq.mrna.native.load = no
    #	to the configuration file

    #	We were having some trouble with eieio, running these on kksilo
    #	this time.  This is best done on eieio.
    #	perform the genbank mrna run, this does only native and xeno mRNA's
    ssh kksilo
    nice bin/gbAlignStep -srcDb=genbank -type=mrna -verbose=1 -initial monDom1
    #	This run had non-linear behavior, in the early stages it
    #	appeared that it was going to take 4 days of run time, then in
    #	the last 7 hours it processed over 250,000 jobs:
# Completed: 357938 of 357938 jobs
# CPU time in finished jobs:   51924178s  865402.97m 14423.38h  600.97d  1.647 y
# IO & Wait Time:               3461243s   57687.38m   961.46h   40.06d  0.110 y
# Average job time:                 155s       2.58m     0.04h    0.00d
# Longest job:                     3885s      64.75m     1.08h    0.04d
# Submission to last job:        101926s    1698.77m    28.31h    1.18d

    #	Due to various interruptions, the cluster job was completed
    #	manually by going to the directory and pushing the jobs.  To
    #	clean up after this, and finish the alignment steps:
    ssh kksilo
    cd /cluster/data/genbank
    nice bin/gbAlignStep -continue=finish -srcDb=genbank -type=mrna \
	-verbose=1 -initial monDom1
    #	2 hour finish time

    # Load the results from the above
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad monDom1
    #	2 minute load time

    #	And finally, the long-lived EST alignments.  Several cluster days
    ssh kksilo
    cd /cluster/data/genbank/work
    mv initial.monDom1 initial.monDom1.genbank.mrna

    cd /cluster/data/genbank
    nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial monDom1
    # logFile: var/build/logs/2004.12.15-13:16:59.monDom1.initalign.log
    #	It said there was nothing to align, must be no native est's for
    #	the Opossum, add this line to the genbank.conf file:
    #	monDom1.genbank.est.native.load = no
    #	not useful to do xeno ESTs, they are off by default

# GC5BASE (DONE - 2004-12-08 - Hiram)
    ssh eieio
    mkdir /cluster/data/monDom1/bed/gc5Base
    cd /cluster/data/monDom1/bed/gc5Base
    hgGcPercent -wigOut -doGaps -file=stdout -win=5 monDom1 \
        /cluster/data/monDom1 | wigEncode stdin gc5Base.wig gc5Base.wib
    # takes about 4 hours, need to improve hgGcPercent and how it reads
    # five bases at a time from the sequence.

    ssh hgwdev
    cd /cluster/data/monDom1/bed/gc5Base
    mkdir /gbdb/monDom1/wib
    ln -s `pwd`/gc5Base.wib /gbdb/monDom1/wib
    hgLoadWiggle -pathPrefix=/gbdb/monDom1/wib monDom1 gc5Base gc5Base.wig

    #	need to fix-up hgLoadWiggle so it will make a proper index on
    #	these scaffold assemblies:
    hgsql monDom1 -e "drop index chrom on gc5Base;"
    hgsql monDom1 -e "create index bin on gc5Base (chrom(14), bin);"
    #	this is fixed in hgLoadWiggle as of 2004-12-09

# QUALITY (DONE - 2004-12-14 - Hiram)
    ssh eieio
    mkdir /cluster/data/monDom1/bed/quality
    cd /cluster/data/monDom1/bed/quality
    #	Create perl script to read the quality file and pipe to
    #	wigEncode:
    ./qaToWigEncode.pl ../../broad.mit.edu/assembly.agp.qual \
	| wigEncode stdin quality.wig quality.wib
    #	Converted stdin, upper limit 68.00, lower limit 0.00
    #	Took 6 hours 40 minutes of processing time.
    #	Resulting files:
    ls -og quality.wi?
    #	-rw-rw-r--  1  377775898 Dec 13 21:40 quality.wig
    #	-rw-rw-r--  1 3492108230 Dec 13 21:40 quality.wib


    ssh hgwdev
    cd /cluster/data/monDom1/bed/quality
    ln -s `pwd`/quality.wib /gbdb/monDom1/wib
    hgLoadWiggle -pathPrefix=/gbdb/monDom1/wib monDom1 quality quality.wig
    #	fix the index  (this was supposed to be fixed, hgLoadWiggle
    #	needs to be looked at)
    hgsql monDom1 -e "drop index chrom on quality;"
    hgsql monDom1 -e "create index bin on quality (chrom(14), bin);"

# ALIGN TO HUMAN (DONE - 2004-12-12 - Hiram)
#	NAMING CONVENTION: the zb.hg17 directory will be the Opossum
#	aligned to the Human blastz result.
#	The bz.hg17 directory will the swapped result, Human aligned
#	to Opossum 
  ssh kk
  mkdir -p /cluster/data/monDom1/bed/zb.hg17
  cd /cluster/data/monDom1/bed/zb.hg17
  ln -s /cluster/data/monDom1/bed/zb.hg17 /cluster/data/hg17/bed/blastz.monDom1

  # Create DEF file for run of scaffolds vs. human chromosomes
  cat << '_EOF_' > DEF
# human vs. M.domestica
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
# Set up blastz parameters using parameters between (chicken and fish ?)
# but not abridging repeats since can't do that with scaffolds, and
# it's not very relevant at this evolutionary distance.
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Human
SEQ1_DIR=/scratch/hg/gs.18/build35/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Opossum
SEQ2_DIR=/iscratch/i/monDom1/chunks
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/monDom1/bed/zb.hg17

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
  # << this line keeps emacs coloring happy

    cp -p /cluster/data/hg17/chrom.sizes S1.len
    cp -p /cluster/data/monDom1/chrom.sizes S2.len
    # script copied over from /cluster/data/hg17/jkStuff/BlastZ_run0.sh
    /cluster/data/monDom1/jkStuff/BlastZ_run0.sh

    cd run.0
    para try, push check, para push, para check....
# Completed: 112871 of 112871 jobs
# CPU time in finished jobs:   42061387s  701023.12m 11683.72h  486.82d  1.334 y
# IO & Wait Time:                578862s    9647.70m   160.80h    6.70d  0.018 y
# Average job time:                 378s       6.30m     0.10h    0.00d
# Longest job:                     3157s      52.62m     0.88h    0.04d
# Submission to last job:         81243s    1354.05m    22.57h    0.94d

    #	Second cluster run to convert the .out's to .lav's
    #	You do NOT want to run this on the big cluster.  It brings
    #	the file server to its knees.  Run this on the small cluster.
    ssh kki
    cd /cluster/data/monDom1/bed/zb.hg17
    # script copied over from /cluster/data/hg17/jkStuff/BlastZ_run1.sh
    /cluster/data/monDom1/jkStuff/BlastZ_run1.sh
    cd run.1
    para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs:       3525s      58.75m     0.98h    0.04d  0.000 y
# IO & Wait Time:                 11315s     188.58m     3.14h    0.13d  0.000 y
# Average job time:                  44s       0.73m     0.01h    0.00d
# Longest job:                      181s       3.02m     0.05h    0.00d
# Submission to last job:          1524s      25.40m     0.42h    0.02d

    # third run: lav -> axt (an I/O bound process, run only on fileserver)
    #	Use the chunks.2bit so lavToAxt will work. Then lift the resulting axt

    ssh eieio
    cd /cluster/data/monDom1/bed/zb.hg17
    mkdir axtChrom pslChrom

    cat << '_EOF_' > lav2axt.sh
#!/bin/sh
for L in lav/chr*
do
    chr=${L/lav\//}
    cd ${L}
    echo -n "${L} -> ${chr}.axt working ... "
    cat `ls -1 *.lav | sort -g` \
	| lavToAxt stdin \
	/iscratch/i/hg17/bothMaskedNibs \
	/iscratch/i/monDom1/chunks.2bit stdout \
	| axtSort stdin stdout | liftUp -axtQ ../../axtChrom/${chr}.axt \
	    /iscratch/i/monDom1/chunkedScaffolds.lft warn stdin
    cd ../..
    echo -n "${chr}.axt -> ${chr}.psl working ..."
    axtToPsl axtChrom/${chr}.axt S1.len S2.len pslChrom/${chr}.psl
    echo "DONE"
done
'_EOF_'
    # << this line keeps emacs coloring happy

    chmod +x lav2axt.sh
    ./lav2axt.sh

    #	load this blast result just to see how it looks
    #	It is not needed in the browser after the chains and nets are
    #	made, drop it later.
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.hg17/pslChrom
    hgLoadPsl -table=blastzMonDom1 -noTNameIx hg17 chrM.psl
    bash			#	if you are normally csh
    for I in `ls *.psl | grep -v chrM.psl`
    do
        /cluster/bin/i386/hgLoadPsl -table=blastzMonDom1 -append \
		-noTNameIx hg17 ${I}
        echo "done: ${I}"
    done
    exit			#	exit bash if you are normally csh
    #	Let's see what featureBits has to say about this data
    featureBits hg17 blastzMonDom1
    #	486216407 bases of 2866216770 (16.964%) in intersection
    featureBits hg17 blastzMm5
    #	1052077141 bases of 2866216770 (36.706%) in intersection
    featureBits hg17 blastzRn3
    #	1013003285 bases of 2866216770 (35.343%) in intersection



# CHAIN HUMAN BLASTZ  (DONE - 2004-12-12 - Hiram)
    # Run axtChain on file server, job is mostly I/O, cluster does not help
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.hg17
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chainRaw

    ls -1S /cluster/data/monDom1/bed/zb.hg17/axtChrom/*.axt > input.lst
    cat << '_EOF_' > doChain.sh
#!/bin/sh
for chrAxtFile in `cat input.lst`
do
    chr=`basename ${chrAxtFile}`
    axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
	-linearGap=../../opossumHumanTuned.gap \
	-minScore=5000 ${chrAxtFile} \
	/cluster/data/hg17/nib \
	/cluster/data/monDom1/monDom1.2bit chainRaw/${chr}.chain > out/${chr}.out
done
'_EOF_'
    # << this line keeps emacs coloring happy

    # Reuse gap penalties from chicken run.
    cat << '_EOF_' | sed 's/  */\t/g' > ../../opossumHumanTuned.gap
tablesize	11
smallSize	111
position	1	2	3	11	111	2111	12111	32111	72111	152111	252111
qGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
tGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
bothGap	625	660	700	750	900	1400	4000	8000	16000	32000	57000
'_EOF_'
    # << this line keeps emacs coloring happy

    chmod +x doChain
    ./doChain

    # On the file server weed the chains for repeats
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.hg17/axtChain
    mkdir chain
    cd chainRaw
    bash			#	if you are normally csh
    for i in *.chain
    do
        chainAntiRepeat /cluster/data/hg17/nib \
		/cluster/data/monDom1/monDom1.2bit $i ../chain/$i
    done
    exit			#	exit bash if you are normally csh

    # now on the file server, sort chains.  The chainMergeSort and chainSplit
    # steps both take about 10 minutes
    ssh kksilo
    cd /cluster/data/monDom1/bed/zb.hg17/axtChain
    chainMergeSort chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # Load chains into database.  This takes an hour
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.hg17/axtChain/chain
for i in *.chain
do
    c=${i/.chain/}
    echo "loading ${c}"
    hgLoadChain hg17 ${c}_chainMonDom1 $i
done

    featureBits hg17 chainMonDom1
    #	2618859142 bases of 2866216770 (91.370%) in intersection
    featureBits hg17 chainMonDom1Link
    #	456069062 bases of 2866216770 (15.912%) in intersection

    #	create a copy of these things for hgdownload  (2005-01-12 - Hiram)
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsMonDom1
    cd /cluster/data/monDom1/bed/zb.hg17/axtChain/chain
    tar cvzf \
	/usr/local/apache/htdocs/goldenPath/hg17/vsMonDom1/opossum.chain.tgz \
	./chr*.chain

# NET OPOSSUM TO HUMAN BLASTZ (DONE - 2004-12-13 - Hiram)
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.hg17/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net
    #	11 minutes

    # Create axt files for the net as so:
    netSplit noClass.net noClass
    #	2 minutes
    cd noClass
    mkdir ../../axtNet
    bash		#	if you are csh normally
    for i in *.net
    do
	r=${i/.net/}
	netToAxt $i ../chain/${r}.chain /cluster/data/hg17/nib \
		/cluster/data/monDom1/monDom1.2bit ../../axtNet/${r}.axt
    done
    #	18 minutes
    exit		#	back to csh if you are csh normally

    cd ..
    rm -r noClass
    
    # Add classification info using db tables: 
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.hg17/axtChain
    netClass -noAr noClass.net hg17 monDom1 human.net
    #	41 minutes

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.hg17/axtChain
    netFilter -minGap=10 human.net |  hgLoadNet hg17 netMonDom1 stdin
    #	4 minutes
    #	compare featureBits with some other assemblies
    featureBits hg17 netMonDom1
    #	2595812636 bases of 2866216770 (90.566%) in intersection
    featureBits hg17 netBosTau1
    #	2261085097 bases of 2866216770 (78.887%) in intersection
    featureBits hg17 netRn3
    #	2817656275 bases of 2866216770 (98.306%) in intersection

    #	create a copy of these things for hgdownload  (2005-01-12 - Hiram)
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsMonDom1
    cd /cluster/data/monDom1/bed/zb.hg17/axtChain
    cp -p human.net \
	/usr/local/apache/htdocs/goldenPath/hg17/vsMonDom1/opossum.net
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsMonDom1
    gzip opossum.net
    #	The axtNet files need to go too (2005-02-24 - Hiram)
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/hg17/vsMonDom1/axtNet
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsMonDom1/axtNet
    cp -p /cluster/data/monDom1/bed/zb.hg17/axtNet/*.axt .
    gzip *

    # index had NULL cardinality; analyze to fix (2005-01-18 - Heather)
    hgsql hg17 -e "analyze table netMonDom1;"

# MAKE NET MAFS FOR MULTIZ (2004-12-24 kate)
    ssh kksilo
    cd /cluster/data/monDom1/bed/zb.hg17/axtNet
    mkdir -p ../mafNet
    cat *.axt | axtSort stdin stdout | axtToMaf -tSplit stdin \
     -tPrefix=hg17. /cluster/data/hg17/chrom.sizes \
     -qPrefix=monDom1. /cluster/data/monDom1/chrom.sizes ../mafNet


# SWAP EVERYTHING AND REMAKE NET FROM OPOSSUM REFERENCE
#	Create HUMAN TO OPOSSUM chain, net
    ssh eieio
    cd /cluster/data/monDom1/bed
    mkdir -p bz.hg17/axtChain
    chainSwap zb.hg17/axtChain/all.chain bz.hg17/axtChain/all.chain
    #	4 minutes

    cd bz.hg17/axtChain
    chainPreNet all.chain ../../zb.hg17/S2.len ../../zb.hg17/S1.len stdout \
	| chainNet stdin -minSpace=1 ../../zb.hg17/S2.len \
		../../zb.hg17/S1.len stdout /dev/null \
	| netSyntenic stdin noClass.net
    #	10 minutes
    mkdir ../axtNet
    netToAxt noClass.net all.chain /cluster/data/monDom1/monDom1.2bit \
	/cluster/data/hg17/nib ../axtNet/all.axt
    #	23 minutes

    # Classify net and load into database
    ssh hgwdev
    cd /cluster/data/monDom1/bed/bz.hg17/axtChain
    netClass -noAr noClass.net monDom1 hg17 monDom1.net
    #	16 minutes
    netFilter -minGap=10 monDom1.net | hgLoadNet monDom1 netHg17 stdin
    #	6 minutes

    # Load chains into database (takes about 4 hours).
    hgLoadChain -tIndex monDom1 chainHg17 all.chain
    # if you do a 'show index' SQL reports Cardinality NULL for the tName
    #	indexes although they are actually working.   To make the
    #	cardinality show up:
    hgsql monDom1 -e "analyze table chainHg17;"
    hgsql monDom1 -e "analyze table chainHg17Link;"

    #	check featureBits  (~ 30 minute commands)
    featureBits monDom1 chainHg17Link
    #	459086956 bases of 3492108230 (13.146%) in intersection
    featureBits monDom1 chainHg17
    #	3246158618 bases of 3492108230 (92.957%) in intersection
    featureBits monDom1 netHg17
    #	3223616544 bases of 3492108230 (92.311%) in intersection

    #	create a copy of for hgdownload  (2005-01-13 - Hiram)
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/monDom1/vsHg17
    cd /cluster/data/monDom1/bed/bz.hg17/axtChain
    gzip --stdout all.chain > \
	/usr/local/apache/htdocs/goldenPath/monDom1/vsHg17/human.chain.gz
    gzip --stdout monDom1.net > \
	/usr/local/apache/htdocs/goldenPath/monDom1/vsHg17/human.net.gz

# BLASTZ HUMAN CLEAN UP (DONE - 2005-01-12 - Hiram)
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.hg17/axtChain
    nice rm -rf noClass.net chain/chain.tab chain/link.tab chain.run1 \
	chainRaw out &
    nice gzip chain/* &
    nice gzip human.net &
    nice gzip all.chain &

    cd /cluster/data/monDom1/bed/zb.hg17
    nice rm -rf raw &
    nice gzip axtChrom/maf/* &
    nice gzip axtChrom/*.axt &
    nice gzip pslChrom/* lav/*/* &

    cd /cluster/data/monDom1/bed/bz.hg17/axtChain
    nice rm -f noClass.net link.tab chain.tab &
    nice gzip all.chain monDom1.net &
    cd ../axtNet
    nice gzip all.axt &

# LOAD CPGISSLANDS (DONE - 2004-12-13 - Hiram)
    ssh hgwdev
    mkdir -p /cluster/data/monDom1/bed/cpgIsland
    cd /cluster/data/monDom1/bed/cpgIsland
    # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
    cvs co hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    make
    mv cpglh.exe /cluster/data/monDom1/bed/cpgIsland/
    
    ssh eieio
    cd /cluster/data/monDom1/bed/cpgIsland

    # AWK filter to transform cpglh output to bed +
    cat << '_EOF_' > filter.awk
/* Input columns: */
/* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */
/* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */
/* Output columns: */
/* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */
/* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\to0.94 */
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
    # << this line keeps emacs coloring happy

    rm -f cpgIsland.bed
    cat /dev/null > cpgIsland.bed
    bash		#	if you are normally csh
    for scaffold in `awk '{print $1}' /cluster/data/monDom1/chrom.sizes`
    do
	cpgOutput=${scaffold}.cpg
	echo "${scaffold}"
	rm -f /tmp/${scaffold}.fa
	twoBitToFa /cluster/data/monDom1/monDom1.2bit:${scaffold} \
		/tmp/${scaffold}.fa
	./cpglh.exe /tmp/${scaffold}.fa | awk -f filter.awk >> cpgIsland.bed
	rm -f /tmp/${scaffold}.fa
    done
    exit		#	exit bash if you are normally csh

    # load into database:
    ssh hgwdev
    cd /cluster/data/monDom1/bed/cpgIsland
    hgLoadBed monDom1 cpgIslandExt -tab -noBin \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
    #	Loaded 27004 elements of size 10

    #	compare to some other assemblies
    featureBits monDom1 cpgIslandExt
    #	18445119 bases of 3492108230 (0.528%) in intersection
    featureBits hg17 cpgIslandExt
    #	21245283 bases of 2866216770 (0.741%) in intersection
    featureBits panTro1 cpgIsland
    #	17540676 bases of 2733948177 (0.642%) in intersection
    featureBits rn3 cpgIsland
    #	9679881 bases of 2571104688 (0.376%) in intersection
    featureBits xenTro1 cpgIslandExt
    #	19279778 bases of 1381238994 (1.396%) in intersection
    featureBits fr1 cpgIsland
    #	25518433 bases of 315518167 (8.088%) in intersection

# ALIGN TO CHICKEN (DONE - 2004-12-14 - Hiram)
#	NAMING CONVENTION: the zb.galGal2 directory will be the Opossum
#	aligned to the Chicken blastz result.
#	The bz.galGal2 directory will the swapped result, Chicken aligned
#	to Opossum 
  ssh kk
  mkdir -p /cluster/data/monDom1/bed/zb.galGal2
  cd /cluster/data/monDom1/bed/zb.galGal2

  # Create DEF file for run of scaffolds vs. chicken chromosomes
  cat << '_EOF_' > DEF
# chicken vs. M.domestica
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
# Set up blastz parameters using parameters between (chicken and fish ?)
# but not abridging repeats since can't do that with scaffolds, and
# it's not very relevant at this evolutionary distance.
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: chicken
SEQ1_DIR=/iscratch/i/galGal2/nib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Opossum
SEQ2_DIR=/iscratch/i/monDom1/chunks
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/monDom1/bed/zb.galGal2

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
  # << this line keeps emacs coloring happy

    cp -p /cluster/data/galGal2/chrom.sizes S1.len
    cp -p /cluster/data/monDom1/chrom.sizes S2.len
    # script copied over from /cluster/data/galGal2/jkStuff/BlastZ_run0.sh
    /cluster/data/monDom1/jkStuff/BlastZ_run0.sh

    cd run.0
    para try, push check, para push, para check....
# Completed: 49981 of 49981 jobs
# CPU time in finished jobs:   24745519s  412425.31m  6873.76h  286.41d  0.785 y
# IO & Wait Time:                204758s    3412.64m    56.88h    2.37d  0.006 y
# Average job time:                 499s       8.32m     0.14h    0.01d
# Longest job:                     2857s      47.62m     0.79h    0.03d
# Submission to last job:         85888s    1431.47m    23.86h    0.99d

    #	Second cluster run to convert the .out's to .lav's
    #	You do NOT want to run this on the big cluster.  It brings
    #	the file server to its knees.  Run this on the small cluster.
    ssh kki
    cd /cluster/data/monDom1/bed/zb.galGal2
    # script copied over from /cluster/data/galGal2/jkStuff/BlastZ_run1.sh
    /cluster/data/monDom1/jkStuff/BlastZ_run1.sh
    cd run.1
    para try, check, push, etc ...
# Completed: 151 of 151 jobs
# CPU time in finished jobs:       1131s      18.85m     0.31h    0.01d  0.000 y
# IO & Wait Time:                  2130s      35.50m     0.59h    0.02d  0.000 y
# Average job time:                  22s       0.36m     0.01h    0.00d
# Longest job:                       95s       1.58m     0.03h    0.00d
# Submission to last job:           490s       8.17m     0.14h    0.01d

    # third run: lav -> axt (an I/O bound process, run only on fileserver)
    #	Use the chunks.2bit so lavToAxt will work. Then lift the resulting axt

    ssh eieio
    cd /cluster/data/monDom1/bed/zb.galGal2
    mkdir axtChrom pslChrom

    cat << '_EOF_' > lav2axt.sh
#!/bin/sh
for L in lav/chr*
do
    chr=${L/lav\//}
    cd ${L}
    echo -n "${L} -> ${chr}.axt working ... "
    cat `ls -1 *.lav | sort -g` \
	| lavToAxt stdin \
	/iscratch/i/galGal2/nib \
	/iscratch/i/monDom1/chunks.2bit stdout \
	| axtSort stdin stdout | liftUp -axtQ ../../axtChrom/${chr}.axt \
	    /iscratch/i/monDom1/chunkedScaffolds.lft warn stdin
    cd ../..
    echo -n "${chr}.axt -> ${chr}.psl working ..."
    axtToPsl axtChrom/${chr}.axt S1.len S2.len pslChrom/${chr}.psl
    echo "DONE"
done
'_EOF_'
    # << this line keeps emacs coloring happy

    chmod +x lav2axt.sh
    ./lav2axt.sh
    #	12 minutes

    #	load this blast result just to see how it looks
    #	It is not needed in the browser after the chains and nets are
    #	made, drop it later.
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.galGal2/pslChrom

    hgLoadPsl -table=blastzMonDom1 -noTNameIx galGal2 chrM.psl
    bash			#	if you are normally csh
    for I in `ls *.psl | grep -v chrM.psl`
    do
        /cluster/bin/i386/hgLoadPsl -table=blastzMonDom1 -append \
		-noTNameIx galGal2 ${I}
        echo "done: ${I}"
    done
    exit		#	exit bash if you are normally csh
    #	7 minutes

    #	Let's see what featureBits has to say about this data
    featureBits galGal2 blastzMonDom1
    102241492 bases of 1054197620 (9.699%) in intersection
    featureBits galGal2 blastzBestHg16
    97272784 bases of 1054197620 (9.227%) in intersection


# CHAIN CHICKEN BLASTZ  (DONE - 2004-12-12 - Hiram)
    # Run axtChain on file server, job is mostly I/O, cluster does not help
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.galGal2
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chainRaw

    ls -1S /cluster/data/monDom1/bed/zb.galGal2/axtChrom/*.axt > input.lst
    cat << '_EOF_' > doChain.sh
#!/bin/sh
for chrAxtFile in `cat input.lst`
do
    chr=`basename ${chrAxtFile}`
    axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
	-linearGap=../../opossumChickenTuned.gap \
	-minScore=5000 ${chrAxtFile} \
	/cluster/data/galGal2/nib \
	/cluster/data/monDom1/monDom1.2bit chainRaw/${chr}.chain > out/${chr}.out
done
'_EOF_'
    # << this line keeps emacs coloring happy

    # Reuse gap penalties from chicken run.
    cat << '_EOF_' | sed 's/  */\t/g' > ../../opossumChickenTuned.gap
tablesize	11
smallSize	111
position	1	2	3	11	111	2111	12111	32111	72111	152111	252111
qGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
tGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
bothGap	625	660	700	750	900	1400	4000	8000	16000	32000	57000
'_EOF_'
    # << this line keeps emacs coloring happy

    chmod +x doChain
    time ./doChain
    #	42 minutes

    # On the file server weed the chains for repeats
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.galGal2/axtChain
    mkdir chain
    cd run1/chainRaw
    bash			# if you are normally csh
    for i in *.chain
    do
        chainAntiRepeat /cluster/data/galGal2/nib \
		/cluster/data/monDom1/monDom1.2bit $i ../../chain/$i
	echo "$i done"
    done
    #	7 minutes
    exit			# exit bash if you are normally csh

    # now on the file server, sort chains.  The chainMergeSort and
    # chainSplit steps each take less than a minute
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.galGal2/axtChain
    chainMergeSort chain/*.chain > all.chain
    mv chain chainBeforeMerge
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # Load chains into database.  This takes an hour
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.galGal2/axtChain/chain
    bash		#	if you are csh normally
    for i in *.chain
    do
	c=${i/.chain/}
	echo "loading ${c}"
	hgLoadChain galGal2 ${c}_chainMonDom1 $i
    done
    #	4 minutes
    exit		#	exit bash if you are csh normally

    #	Compare to some other organisms
    featureBits galGal2 chainMonDom1
    #	782212657 bases of 1054197620 (74.200%) in intersection
    featureBits galGal2 chainHg17
    #	921410328 bases of 1054197620 (87.404%) in intersection
    featureBits galGal2 chainXenTro1
    #	691628520 bases of 1054197620 (65.607%) in intersection
    featureBits galGal2 chainMm5
    #	846905330 bases of 1054197620 (80.336%) in intersection

    #	create a copy of these things for hgdownload  (2005-01-12 - Hiram)
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/galGal2/vsMonDom1
    cd /cluster/data/monDom1/bed/zb.galGal2/axtChain/chain
    tar cvzf \
      /usr/local/apache/htdocs/goldenPath/galGal2/vsMonDom1/opossum.chain.tgz \
	./chr*.chain

# NET OPOSSUM TO CHICKEN BLASTZ (DONE - 2004-12-13 - Hiram)
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.galGal2/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net
    #	2 minutes

    # Create axt files for the net as so:
    netSplit noClass.net noClass
    #	1 minute
    cd noClass
    mkdir ../../axtNet
    bash		#	if you are csh normally
    for i in *.net
    do
	r=${i/.net/}
	netToAxt $i ../chain/${r}.chain /cluster/data/galGal2/nib \
		/cluster/data/monDom1/monDom1.2bit ../../axtNet/${r}.axt
    done
    #	6 minutes
    exit		#	back to csh if you are csh normally

    cd ..
    
    # Add classification info using db tables: 
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.galGal2/axtChain
    netClass -noAr noClass.net galGal2 monDom1 chicken.net
    #	56 minutes

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.galGal2/axtChain
    netFilter -minGap=10 chicken.net |  hgLoadNet galGal2 netMonDom1 stdin
    #	4 minutes
    #	compare featureBits with some other assemblies
    featureBits galGal2 netMonDom1
    #	771045333 bases of 1054197620 (73.140%) in intersection
    featureBits galGal2 netHg17
    #	908340945 bases of 1054197620 (86.164%) in intersection
    featureBits galGal2 netMm5
    #	835277984 bases of 1054197620 (79.234%) in intersection
    featureBits galGal2 netXenTro1
    #	674438463 bases of 1054197620 (63.976%) in intersection

    #	create a copy of these things for hgdownload  (2005-01-12 - Hiram)
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/galGal2/vsMonDom1
    cd /cluster/data/monDom1/bed/zb.galGal2/axtChain
    cp -p chicken.net \
	/usr/local/apache/htdocs/goldenPath/galGal2/vsMonDom1/opossum.net
    cd /usr/local/apache/htdocs/goldenPath/galGal2/vsMonDom1
    gzip opossum.net


# SWAP EVERYTHING AND REMAKE NET FROM OPOSSUM REFERENCE
#	Create CHICKEN TO OPOSSUM chain, net
    ssh eieio
    cd /cluster/data/monDom1/bed
    mkdir -p bz.galGal2/axtChain
    chainSwap zb.galGal2/axtChain/all.chain bz.galGal2/axtChain/all.chain
    #	1 minute

    cd bz.galGal2/axtChain
    chainPreNet all.chain ../../zb.galGal2/S2.len \
		../../zb.galGal2/S1.len stdout \
	| chainNet stdin -minSpace=1 ../../zb.galGal2/S2.len \
		../../zb.galGal2/S1.len stdout /dev/null \
	| netSyntenic stdin noClass.net
    #	2 minutes
    mkdir ../axtNet
    netToAxt noClass.net all.chain /cluster/data/monDom1/monDom1.2bit \
	/cluster/data/galGal2/nib ../axtNet/all.axt
    #	6 minutes

    # Classify net and load into database
    ssh hgwdev
    cd /cluster/data/monDom1/bed/bz.galGal2/axtChain
    netClass -noAr noClass.net monDom1 galGal2 monDom1.net
    #	31 minutes
    netFilter -minGap=10 monDom1.net | hgLoadNet monDom1 netGalGal2 stdin
    #	4 minutes

    # Load chains into database (takes about 4 hours).
    hgLoadChain -tIndex monDom1 chainGalGal2 all.chain
    # if you do a 'show index' SQL reports Cardinality NULL for the tName
    #	indexes although they are actually working.   To make the
    #	cardinality show up:
    hgsql monDom1 -e "analyze table chainGalGal2;"
    hgsql monDom1 -e "analyze table chainGalGal2Link;"

    featureBits monDom1 chainGalGal2Link
    #	110933245 bases of 3492108230 (3.177%) in intersection
    featureBits monDom1 chainGalGal2
    #	2437546469 bases of 3492108230 (69.802%) in intersection

    #	create a copy of for hgdownload  (2005-01-13 - Hiram)
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/monDom1/vsGalGal2
    cd /cluster/data/monDom1/bed/bz.galGal2/axtChain
    gzip --stdout all.chain > \
	/usr/local/apache/htdocs/goldenPath/monDom1/vsGalGal2/chicken.chain.gz
    gzip --stdout monDom1.net > \
	/usr/local/apache/htdocs/goldenPath/monDom1/vsGalGal2/chicken.net.gz

# BLASTZ CHICKEN CLEAN UP (DONE - 2005-01-12 - Hiram)
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.galGal2/axtChain
    nice rm -rf run1/out run1/chainRaw chain/chain.tab chain/link.tab \
	noClass.net chainBeforeMerge noClass &
    nice gzip chain/* &
    nice gzip chicken.net &
    nice gzip all.chain &

    cd /cluster/data/monDom1/bed/zb.galGal2
    nice rm -rf raw &
    nice gzip axtNet/*.axt &
    nice gzip axtChrom/*.axt &
    nice gzip pslChrom/* lav/*/* &

    cd /cluster/data/monDom1/bed/bz.galGal2/axtChain
    nice rm -f noClass.net link.tab chain.tab &
    nice gzip all.chain monDom1.net &
    cd ../axtNet
    nice gzip all.axt &

# ALIGN TO MOUSE (DONE - 2004-12-16 - Hiram)
#	NAMING CONVENTION: the zb.mm5 directory will be the Opossum
#	aligned to the Mouse blastz result.
#	The bz.mm5 directory will the swapped result, Mouse aligned
#	to Opossum 
  ssh kk
  mkdir -p /cluster/data/monDom1/bed/zb.mm5
  cd /cluster/data/monDom1/bed/zb.mm5

  # Create DEF file for run of scaffolds vs. mouse chromosomes
  cat << '_EOF_' > DEF
# mouse vs. M.domestica
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
# Set up blastz parameters using parameters between (chicken and fish ?)
# but not abridging repeats since can't do that with scaffolds, and
# it's not very relevant at this evolutionary distance.
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: mouse
SEQ1_DIR=/scratch/mus/mm5/softNib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Opossum
SEQ2_DIR=/iscratch/i/monDom1/chunks
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/monDom1/bed/zb.mm5

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
  # << this line keeps emacs coloring happy

    cp -p /cluster/data/mm5/chrom.sizes S1.len
    cp -p /cluster/data/monDom1/chrom.sizes S2.len
    # script copied over from /cluster/data/mm5/jkStuff/BlastZ_run0.sh
    /cluster/data/monDom1/jkStuff/BlastZ_run0.sh

    cd run.0
    para try, push check, para push, para check....
# Completed: 112871 of 112871 jobs
# CPU time in finished jobs:   36169619s  602826.99m 10047.12h  418.63d  1.147 y
# IO & Wait Time:                452242s    7537.36m   125.62h    5.23d  0.014 y
# Average job time:                 324s       5.41m     0.09h    0.00d
# Longest job:                     2755s      45.92m     0.77h    0.03d
# Submission to last job:        153519s    2558.65m    42.64h    1.78d
    #	extra delay in this run due to disk problems in the middle and
    #	thus being interrupted

    #	Second cluster run to convert the .out's to .lav's
    #	You do NOT want to run this on the big cluster.  It brings
    #	the file server to its knees.  Run this on the small cluster.
    ssh kki
    cd /cluster/data/monDom1/bed/zb.mm5
    # script copied over from /cluster/data/mm5/jkStuff/BlastZ_run1.sh
    /cluster/data/monDom1/jkStuff/BlastZ_run1.sh
    cd run.1
    para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs:       3131s      52.19m     0.87h    0.04d  0.000 y
# IO & Wait Time:                  9815s     163.58m     2.73h    0.11d  0.000 y
# Average job time:                  38s       0.63m     0.01h    0.00d
# Longest job:                      223s       3.72m     0.06h    0.00d
# Submission to last job:          1267s      21.12m     0.35h    0.01d

    # third run: lav -> axt (an I/O bound process, run only on fileserver)
    #	Use the chunks.2bit so lavToAxt will work. Then lift the resulting axt

    ssh eieio
    cd /cluster/data/monDom1/bed/zb.mm5
    mkdir axtChrom pslChrom

    cat << '_EOF_' > lav2axt.sh
#!/bin/sh
for L in lav/chr*
do
    chr=${L/lav\//}
    cd ${L}
    echo -n "${L} -> ${chr}.axt working ... "
    cat `ls -1 *.lav | sort -g` \
	| lavToAxt stdin \
	/scratch/mus/mm5/softNib \
	/iscratch/i/monDom1/chunks.2bit stdout \
	| axtSort stdin stdout | liftUp -axtQ ../../axtChrom/${chr}.axt \
	    /iscratch/i/monDom1/chunkedScaffolds.lft warn stdin
    cd ../..
    echo -n "${chr}.axt -> ${chr}.psl working ..."
    axtToPsl axtChrom/${chr}.axt S1.len S2.len pslChrom/${chr}.psl
    echo "DONE"
done
'_EOF_'
    # << this line keeps emacs coloring happy

    chmod +x lav2axt.sh
    ./lav2axt.sh
    #	38 minutes

    #	load this blast result just to see how it looks
    #	It is not needed in the browser after the chains and nets are
    #	made, drop it later.
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.mm5/pslChrom
    hgLoadPsl -table=blastzMonDom1 -noTNameIx mm5 chrM.psl
    bash		#	if you are csh normally
    for I in `ls *.psl | grep -v chrM.psl`
    do
        /cluster/bin/i386/hgLoadPsl -table=blastzMonDom1 -append \
		-noTNameIx mm5 ${I}
        echo "done: ${I}"
    done
    #	12 minutes
    exit		#	back to csh if you are csh normally
    #	Let's see what featureBits has to say about this data
    featureBits mm5 blastzMonDom1
    263272041 bases of 2615483787 (10.066%) in intersection

# CHAIN MOUSE BLASTZ  (TBD)
    # Run axtChain on file server, job is mostly I/O, cluster does not help
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.mm5
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chainRaw

    ls -1S /cluster/data/monDom1/bed/zb.mm5/axtChrom/*.axt > input.lst
    cat << '_EOF_' > doChain.sh
#!/bin/sh
for chrAxtFile in `cat input.lst`
do
    chr=`basename ${chrAxtFile}`
    axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
	-linearGap=../../opossumMouseTuned.gap \
	-minScore=5000 ${chrAxtFile} \
	/scratch/mus/mm5/softNib \
	/cluster/data/monDom1/monDom1.2bit chainRaw/${chr}.chain > out/${chr}.out
done
'_EOF_'
    # << this line keeps emacs coloring happy

    # Reuse gap penalties from mouse run.
    cat << '_EOF_' | sed 's/  */\t/g' > ../../opossumMouseTuned.gap
tablesize	11
smallSize	111
position	1	2	3	11	111	2111	12111	32111	72111	152111	252111
qGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
tGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
bothGap	625	660	700	750	900	1400	4000	8000	16000	32000	57000
'_EOF_'
    # << this line keeps emacs coloring happy

    chmod +x doChain
    time ./doChain
    #	95 minutes

    # On the file server weed the chains for repeats
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.mm5/axtChain
    mkdir chain
    cd run1/chainRaw
    bash			# if you are normally csh
    for i in *.chain
    do
        chainAntiRepeat /scratch/mus/mm5/softNib \
		/cluster/data/monDom1/monDom1.2bit $i ../../chain/$i
	echo "$i done"
    done
    #	34 minutes
    exit			# exit bash if you are normally csh

    # now on the file server, sort chains. 
    # chainSplit steps each take less than a minute
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.mm5/axtChain
    chainMergeSort chain/*.chain > all.chain
    #	3 minutes
    mv chain chainBeforeMerge
    chainSplit chain all.chain
    #	3 minutes
    rm run1/chain/*.chain

    # Load chains into database.  This takes an hour
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.mm5/axtChain/chain
    bash		#	if you are csh normally
    for i in *.chain
    do
	c=${i/.chain/}
	echo "loading ${c}"
	hgLoadChain mm5 ${c}_chainMonDom1 $i
    done
    #	20 minutes
    exit		#	exit bash if you are csh normally

    #	Compare to some other organisms
    featureBits mm5 chainMonDom1
    #	2121448151 bases of 2615483787 (81.111%) in intersection
    featureBits -countGaps mm5 chainRn3
    #	2646682349 bases of 3164952073 (83.625%) in intersection
    featureBits mm5 chainRn3
    #	2646682349 bases of 2615483787 (101.193%) in intersection
    featureBits mm5 chainHg17
    #	2507720521 bases of 2615483787 (95.880%) in intersection
    featureBits mm5 chainXenTro1
    #	1078618413 bases of 2615483787 (41.240%) in intersection

    #	create a copy of these things for hgdownload  (2005-01-12 - Hiram)
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/vsMonDom1
    cd /cluster/data/monDom1/bed/zb.mm5/axtChain/chain
    tar cvzf \
	/usr/local/apache/htdocs/goldenPath/mm5/vsMonDom1/opossum.chain.tgz \
	./chr*.chain

# NET OPOSSUM TO MOUSE BLASTZ (TBD)
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.mm5/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
	| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
	| netSyntenic stdin noClass.net
    #	6 minutes

    # Create axt files for the net as so:
    netSplit noClass.net noClass
    #	1 minute
    cd noClass
    mkdir ../../axtNet
    bash		#	if you are csh normally
    for i in *.net
    do
	r=${i/.net/}
	netToAxt $i ../chain/${r}.chain /scratch/mus/mm5/softNib \
		/cluster/data/monDom1/monDom1.2bit ../../axtNet/${r}.axt
    done
    #	10 minutes
    exit		#	back to csh if you are csh normally

    cd ..
    rm -r noClass
    
    # Add classification info using db tables: 
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.mm5/axtChain
    netClass -noAr noClass.net mm5 monDom1 mouse.net
    #	41 minutes

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.mm5/axtChain
    netFilter -minGap=10 mouse.net |  hgLoadNet mm5 netMonDom1 stdin
    #	3 minutes
    #	compare featureBits with some other assemblies
    featureBits mm5 netMonDom1
    #	2094316044 bases of 2615483787 (80.074%) in intersection
    featureBits mm5 netRn3
    #	2638255333 bases of 2615483787 (100.871%) in intersection
    featureBits -countgaps mm5 netRn3
    #	2638255333 bases of 3164952073 (83.358%) in intersection
    featureBits mm5 netHg17
    #	2504056038 bases of 2615483787 (95.740%) in intersection
    featureBits mm5 netCanFam1
    #	2456773441 bases of 2615483787 (93.932%) in intersection
    featureBits mm5 netGalGal2
    #	1958796258 bases of 2615483787 (74.892%) in intersection
    featureBits mm5 netXenTro1
    #	1042210258 bases of 2615483787 (39.848%) in intersection
    featureBits mm5 netTetNig1
    #	618111072 bases of 2615483787 (23.633%) in intersection
    featureBits mm5 netDanRer2
    #	560950601 bases of 2615483787 (21.447%) in intersection

    #	create a copy of these things for hgdownload  (2005-01-12 - Hiram)
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/vsMonDom1
    cd /cluster/data/monDom1/bed/zb.mm5/axtChain
    cp -p mouse.net \
	/usr/local/apache/htdocs/goldenPath/mm5/vsMonDom1/opossum.net
    cd /usr/local/apache/htdocs/goldenPath/mm5/vsMonDom1
    gzip opossum.net

# SWAP EVERYTHING AND REMAKE NET FROM OPOSSUM REFERENCE
#	Create MOUSE TO OPOSSUM chain, net
    ssh eieio
    cd /cluster/data/monDom1/bed
    mkdir -p bz.mm5/axtChain
    chainSwap zb.mm5/axtChain/all.chain bz.mm5/axtChain/all.chain
    #	2 minutes

    cd bz.mm5/axtChain
    chainPreNet all.chain ../../zb.mm5/S2.len \
		../../zb.mm5/S1.len stdout \
	| chainNet stdin -minSpace=1 ../../zb.mm5/S2.len \
		../../zb.mm5/S1.len stdout /dev/null \
	| netSyntenic stdin noClass.net
    #	6 minutes
    mkdir ../axtNet
    netToAxt noClass.net all.chain /cluster/data/monDom1/monDom1.2bit \
	/scratch/mus/mm5/softNib ../axtNet/all.axt
    #	9 minutes

    # Classify net and load into database
    ssh hgwdev
    cd /cluster/data/monDom1/bed/bz.mm5/axtChain
    netClass -noAr noClass.net monDom1 mm5 monDom1.net
    #	13 minutes
    netFilter -minGap=10 monDom1.net | hgLoadNet monDom1 netMm5 stdin
    #	3 minutes

    # Load chains into database (takes about 4 hours).
    hgLoadChain -tIndex monDom1 chainMm5 all.chain
    # if you do a 'show index' SQL reports Cardinality NULL for the tName
    #	indexes although they are actually working.   To make the
    #	cardinality show up:
    hgsql monDom1 -e "analyze table chainMm5;"
    hgsql monDom1 -e "analyze table chainMm5Link;"

    featureBits monDom1 chainMm5Link
    #	249594220 bases of 3492108230 (7.147%) in intersection
    featureBits monDom1 chainMm5
    #	2913812625 bases of 3492108230 (83.440%) in intersection
    featureBits monDom1 netMm5
    #	2889580530 bases of 3492108230 (82.746%) in intersection

    #	create a copy of for hgdownload  (2005-01-13 - Hiram)
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/monDom1/vsMm5
    cd /cluster/data/monDom1/bed/bz.mm5/axtChain
    gzip --stdout all.chain > \
	/usr/local/apache/htdocs/goldenPath/monDom1/vsMm5/mouse.chain.gz
    gzip --stdout monDom1.net > \
	/usr/local/apache/htdocs/goldenPath/monDom1/vsMm5/mouse.net.gz

# BLASTZ MOUSE CLEAN UP (DONE - 2005-01-12 - Hiram)
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.mm5/axtChain
    nice rm -rf run1/out run1/chainRaw chain/chain.tab chain/link.tab \
	noClass.net chainBeforeMerge noClass &
    nice gzip chain/* &
    nice gzip mouse.net &
    nice gzip all.chain &

    cd /cluster/data/monDom1/bed/zb.mm5
    nice rm -rf raw &
    nice gzip axtNet/*.axt &
    nice gzip axtChrom/*.axt &
    nice gzip pslChrom/* lav/*/* &

    cd /cluster/data/monDom1/bed/bz.mm5/axtChain
    nice rm -f noClass.net link.tab chain.tab &
    nice gzip all.chain monDom1.net &
    cd ../axtNet
    nice gzip all.axt &

# MAKE Human Proteins track (hg17 DONE 2004-12-15 braney)
    ssh eieio
    mkdir -p /cluster/data/monDom1/blastDb
    cd /cluster/data/monDom1/blastDb
    faSplit sequence ../broad.mit.edu/scaffolds.fa.unmasked 500 x 
    for i in x*
    do
	formatdb -i $i -p F
    done
    rm *.log *.fa

    ssh kkr1u00
    mkdir -p /iscratch/i/monDom1/blastDb
    cp /cluster/data/monDom1/blastDb/* /iscratch/i/monDom1/blastDb
    (iSync) > sync.out
    exit # back to eieio

    mkdir -p /cluster/data/monDom1/bed/tblastn.hg17KG
    cd /cluster/data/monDom1/bed/tblastn.hg17KG
    ls -1S /iscratch/i/monDom1/blastDb/*.nsq | sed "s/\.nsq//" > query.lst

    calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(350000/`wc query.lst | awk "{print \\\$1}"`\)
    # 42156/(350000/499) = 60.102411
    mkdir -p /cluster/bluearc/monDom1/bed/tblastn.hg17KG/kgfa
    split -l 65 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/monDom1/bed/tblastn.hg17KG/kgfa/kg
    ln -s /cluster/bluearc/monDom1/bed/tblastn.hg17KG/kgfa kgfa
    cd kgfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    mkdir blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.2
	if pslCheck -prot $3.tmp
	then
	    mv $3.tmp $3
	    rm -f $f.1 $f.2 
	fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/monDom1/bed/tblastn.hg17KG
    para create blastSpec
    para push
# Completed: 323851 of 323851 jobs
# CPU time in finished jobs:  113015986s 1883599.76m 31393.33h 1308.06d  3.584 y
# IO & Wait Time:               2732896s   45548.27m   759.14h   31.63d  0.087 y
# Average job time:                 357s       5.96m     0.10h    0.00d
# Longest job:                     4911s      81.85m     1.36h    0.06d
# Submission to last job:        274144s    4569.07m    76.15h    3.17d

    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    ssh kki
    cd /cluster/data/monDom1/bed/tblastn.hg17KG
    para create chainSpec
    para push
# Completed: 649 of 649 jobs
# CPU time in finished jobs:       4726s      78.76m     1.31h    0.05d  0.000 y
# IO & Wait Time:                 76824s    1280.41m    21.34h    0.89d  0.002 y
# Average job time:                 126s       2.09m     0.03h    0.00d
# Longest job:                      312s       5.20m     0.09h    0.00d
# Submission to last job:          7487s     124.78m     2.08h    0.09d

    exit
    # back to eieio
    cd /cluster/data/monDom1/bed/tblastn.hg17KG/blastOut
    for i in kg??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 17,17n -k 17,17n  | uniq  > /cluster/data/monDom1/bed/tblastn.hg17KG/blastHg17KG.psl
    cd ..

    ssh hgwdev
    cd /cluster/data/monDom1/bed/tblastn.hg17KG
    hgLoadPsl monDom1 blastHg17KG.psl

    # back to eieio
    rm -rf blastOut
# End tblastn

# MAKE Human Known Gene mRNAs Mapped by BLATZ track (hg17 DONE 2005-1-14 JK)

    # Create working directory and extract known gene mrna from database
    ssh hgwdev
    cd /cluster/data/monDom1/bed
    mkdir blatz.hg17KG
    cd blatz.hg17KG
    pepPredToFa hg17 knownGeneMrna known.fa

    # Split known genes into pieces and load on /iscratch/i temp device
    ssh kkr1u00
    cd /iscratch/i/hg17
    mkdir knownRna
    cd knownRna
    faSplit sequence /cluster/data/monDom1/bed/blatz.hg17KG/known.fa 9 known
    iSync
    rm /cluster/data/monDom1/bed/blatz.hg17KG/known.fa

    # Do parasol job to align via blatz
    ssh kk
    cd /cluster/data/monDom1/bed/blatz.hg17KG
    mkdir run0
    cd run0
    mkdir psl
    awk '{print $1;}' /cluster/data/monDom1/chrom.sizes > genome.lst
    ls -1 /iscratch/i/hg17/knownRna/*.fa > mrna.lst
    cat << '_EOF_' > gsub
#LOOP
blatz -rna -minScore=3000 /scratch/monDom1/monDom1.2bit:$(file1) $(path2) -out=psl {check out line psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    # Note, I recommend -minScore=6000 next time.
    gensub2 genome.lst mrna.lst gsub spec
    para create spec
    # Then do usual para try/para push/para time
Completed: 174132 of 174132 jobs
CPU time in finished jobs:    9399737s  156662.29m  2611.04h  108.79d  0.298 y
IO & Wait Time:               2143618s   35726.96m   595.45h   24.81d  0.068 y
Average job time:                  66s       1.10m     0.02h    0.00d
Longest job:                     2437s      40.62m     0.68h    0.03d
Submission to last job:         13892s     231.53m     3.86h    0.16d

    # Do sort and near-besting on file server
    ssh eieio
    cd /cluster/data/monDom1/bed/blatz.hg17KG/run0
    catDir psl | pslFilter -minScore=100 -minAli=255 -noHead stdin stdout | sort -k 10 > ../filtered.psl
    cd ..
    pslReps filtered.psl nearBest.psl /dev/null -nohead -minAli=0.5 -nearTop=0.01 -minCover=0.255
    sort -k 14 nearBest.psl > blatzHg17KG.psl

    # Clean up
    rm -r run0/psl 

    # Load into database
    ssh hgwdev
    cd /cluster/data/monDom1/bed/blatz.hg17KG
    hgLoadPsl monDom1 blatzHg17KG.psl 


# ALIGN TO ZEBRAFISH (WORKING - 2004-12-17 - Hiram)
#	NAMING CONVENTION: the zb.danRer2 directory will be the Opossum
#	aligned to the Zebrafish blastz result.
#	The bz.danRer2 directory will the swapped result, Zebrafish aligned
#	to Opossum 
  ssh kk
  mkdir -p /cluster/data/monDom1/bed/zb.danRer2
  cd /cluster/data/monDom1/bed/zb.danRer2

  # Create DEF file for run of scaffolds vs. zebrafish chromosomes
  cat << '_EOF_' > DEF
# zebrafish vs. M.domestica
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
# Set up blastz parameters using parameters between (chicken and fish ?)
# but not abridging repeats since can't do that with scaffolds, and
# it's not very relevant at this evolutionary distance.
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: zebrafish
SEQ1_DIR=/iscratch/i/danRer2/nib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Opossum
SEQ2_DIR=/iscratch/i/monDom1/chunks
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/monDom1/bed/zb.danRer2

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
  # << this line keeps emacs coloring happy

    cp -p /cluster/data/danRer2/chrom.sizes S1.len
    cp -p /cluster/data/monDom1/chrom.sizes S2.len
    # script copied over from /cluster/data/danRer2/jkStuff/BlastZ_run0.sh
    /cluster/data/monDom1/jkStuff/BlastZ_run0.sh

    cd run.0
    para try, push check, para push, para check....
# Completed: 57263 of 57263 jobs
# CPU time in finished jobs:   20015224s  333587.07m  5559.78h  231.66d  0.635 y
# IO & Wait Time:                364057s    6067.61m   101.13h    4.21d  0.012 y
# Average job time:                 356s       5.93m     0.10h    0.00d
# Longest job:                     2681s      44.68m     0.74h    0.03d
# Submission to last job:        117505s    1958.42m    32.64h    1.36d

    #	Second cluster run to convert the .out's to .lav's
    #	You do NOT want to run this on the big cluster.  It brings
    #	the file server to its knees.  Run this on the small cluster.
    ssh kki
    cd /cluster/data/monDom1/bed/zb.danRer2
    # script copied over from /cluster/data/danRer2/jkStuff/BlastZ_run1.sh
    /cluster/data/monDom1/jkStuff/BlastZ_run1.sh
    cd run.1
    para try, check, push, etc ...
Completed: 173 of 173 jobs
CPU time in finished jobs:       1228s      20.47m     0.34h    0.01d  0.000 y
IO & Wait Time:                  4828s      80.46m     1.34h    0.06d  0.000 y
Average job time:                  35s       0.58m     0.01h    0.00d
Longest job:                       49s       0.82m     0.01h    0.00d
Submission to last job:           393s       6.55m     0.11h    0.00d

    # third run: lav -> axt (an I/O bound process, run only on fileserver)
    #	Use the chunks.2bit so lavToAxt will work. Then lift the resulting axt

    ssh eieio
    cd /cluster/data/monDom1/bed/zb.danRer2
    mkdir axtChrom pslChrom

    cat << '_EOF_' > lav2axt.sh
#!/bin/sh
for L in lav/chr*
do
    chr=${L/lav\//}
    cd ${L}
    echo -n "${L} -> ${chr}.axt working ... "
    cat `ls -1 *.lav | sort -g` \
	| lavToAxt stdin \
	/iscratch/i/danRer2/nib \
	/iscratch/i/monDom1/chunks.2bit stdout \
	| axtSort stdin stdout | liftUp -axtQ ../../axtChrom/${chr}.axt \
	    /iscratch/i/monDom1/chunkedScaffolds.lft warn stdin
    cd ../..
    echo -n "${chr}.axt -> ${chr}.psl working ..."
    axtToPsl axtChrom/${chr}.axt S1.len S2.len pslChrom/${chr}.psl
    echo "DONE"
done
'_EOF_'
    # << this line keeps emacs coloring happy

    chmod +x lav2axt.sh
    ./lav2axt.sh
    #	6 minutes

    #	load this blast result just to see how it looks
    #	It is not needed in the browser after the chains and nets are
    #	made, drop it later.
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.danRer2/pslChrom
    hgLoadPsl -table=blastzMonDom1 -noTNameIx danRer2 chrM.psl
    bash		#	if you are csh normally
    for I in `ls *.psl | grep -v chrM.psl`
    do
        /cluster/bin/i386/hgLoadPsl -table=blastzMonDom1 -append \
		-noTNameIx danRer2 ${I}
        echo "done: ${I}"
    done
    #	2 minutes
    exit		#	back to csh if you are csh normally
    #	Let's see what featureBits has to say about this data
    featureBits danRer2 blastzMonDom1
    #	56615212 bases of 1560497282 (3.628%) in intersection

# CHAIN ZEBRAFISH BLASTZ  (DONE - 2004-12-19 - Hiram))
    # Run axtChain on file server, job is mostly I/O, cluster does not help
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.danRer2
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chainRaw

    ls -1S /cluster/data/monDom1/bed/zb.danRer2/axtChrom/*.axt > input.lst
    cat << '_EOF_' > doChain.sh
#!/bin/sh
for chrAxtFile in `cat input.lst`
do
    chr=`basename ${chrAxtFile}`
    axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
	-linearGap=../../opossumZFishTuned.gap \
	-minScore=5000 ${chrAxtFile} \
	/iscratch/i/danRer2/nib \
	/cluster/data/monDom1/monDom1.2bit chainRaw/${chr}.chain > out/${chr}.out
done
'_EOF_'
    # << this line keeps emacs coloring happy

    # Reuse gap penalties from mouse run.
    cat << '_EOF_' | sed 's/  */\t/g' > ../../opossumZFishTuned.gap
tablesize	11
smallSize	111
position	1	2	3	11	111	2111	12111	32111	72111	152111	252111
qGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
tGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
bothGap	625	660	700	750	900	1400	4000	8000	16000	32000	57000
'_EOF_'
    # << this line keeps emacs coloring happy

    chmod +x doChain.sh
    time ./doChain.sh
    #	57 minutes

    # On the file server weed the chains for repeats
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.danRer2/axtChain
    mkdir chain
    cd run1/chainRaw
    bash			# if you are normally csh
    for i in *.chain
    do
        chainAntiRepeat /iscratch/i/danRer2/nib \
		/cluster/data/monDom1/monDom1.2bit $i ../../chain/$i
	echo "$i done"
    done
    #	4 minutes
    exit			# exit bash if you are normally csh

    # now on the file server, sort chains. 
    # chainSplit steps each take less than a minute
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.danRer2/axtChain
    chainMergeSort chain/*.chain > all.chain
    #	1 minute
    mv chain chainBeforeMerge
    chainSplit chain all.chain
    #	1 minute
    rm run1/chain/*.chain

    # Load chains into database.  This takes an hour
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.danRer2/axtChain/chain
    bash		#	if you are csh normally
    for i in *.chain
    do
	c=${i/.chain/}
	echo "loading ${c}"
	hgLoadChain danRer2 ${c}_chainMonDom1 $i
    done
    #	3 minutes
    exit		#	exit bash if you are csh normally

    #	Compare to some other organisms
    featureBits danRer2 chainMonDom1
    #	267969653 bases of 1560497282 (17.172%) in intersection
    featureBits danRer2 chainHg17
    #	510842993 bases of 1560497282 (32.736%) in intersection
    featureBits danRer2 chainMm5
    #	507912858 bases of 1560497282 (32.548%) in intersection
    featureBits danRer2 chainFr1
    #	309784483 bases of 1560497282 (19.852%) in intersection

    #	create a copy of these things for hgdownload  (2005-01-12 - Hiram)
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/danRer2/vsMonDom1
    cd /cluster/data/monDom1/bed/zb.danRer2/axtChain/chain
    tar cvzf \
      /usr/local/apache/htdocs/goldenPath/danRer2/vsMonDom1/opossum.chain.tgz \
	./chr*.chain

# NET OPOSSUM TO ZEBRAFISH BLASTZ (TBD)
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.danRer2/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
	| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
	| netSyntenic stdin noClass.net
    #	1 minute

    # Create axt files for the net as so:
    netSplit noClass.net noClass
    #	1 minute
    cd noClass
    mkdir ../../axtNet
    bash		#	if you are csh normally
    for i in *.net
    do
	r=${i/.net/}
	netToAxt $i ../chain/${r}.chain /iscratch/i/danRer2/nib \
		/cluster/data/monDom1/monDom1.2bit ../../axtNet/${r}.axt
    done
    #	4 minutes
    exit		#	back to csh if you are csh normally

    cd ..
    rm -r noClass
    
    # Add classification info using db tables: 
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.danRer2/axtChain
    netClass -noAr noClass.net danRer2 monDom1 zfish.net
    #	39 minutes

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/monDom1/bed/zb.danRer2/axtChain
    netFilter -minGap=10 zfish.net |  hgLoadNet danRer2 netMonDom1 stdin
    #	1 minute
    #	compare featureBits with some other assemblies
    featureBits danRer2 netMonDom1
    #	251706312 bases of 1560497282 (16.130%) in intersection
    featureBits danRer2 netRn3
XXX - TBD - 2004-12-19 20:15
    #	2638255333 bases of 2615483787 (100.871%) in intersection
    featureBits -countgaps danRer2 netRn3
    #	2638255333 bases of 3164952073 (83.358%) in intersection
    featureBits danRer2 netHg17
    #	2504056038 bases of 2615483787 (95.740%) in intersection
    featureBits danRer2 netCanFam1
    #	2456773441 bases of 2615483787 (93.932%) in intersection
    featureBits danRer2 netGalGal2
    #	1958796258 bases of 2615483787 (74.892%) in intersection
    featureBits danRer2 netXenTro1
    #	1042210258 bases of 2615483787 (39.848%) in intersection
    featureBits danRer2 netTetNig1
    #	618111072 bases of 2615483787 (23.633%) in intersection
    featureBits danRer2 netDanRer2
    #	560950601 bases of 2615483787 (21.447%) in intersection

    #	create a copy of these things for hgdownload  (2005-01-12 - Hiram)
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/danRer2/vsMonDom1
    cd /cluster/data/monDom1/bed/zb.danRer2/axtChain
    cp -p zfish.net \
	/usr/local/apache/htdocs/goldenPath/danRer2/vsMonDom1/opossum.net
    cd /usr/local/apache/htdocs/goldenPath/danRer2/vsMonDom1
    gzip opossum.net


# SWAP EVERYTHING AND REMAKE NET FROM OPOSSUM REFERENCE
#	Create ZEBRAFISH TO OPOSSUM chain, net
    ssh eieio
    cd /cluster/data/monDom1/bed
    mkdir -p bz.danRer2/axtChain
    chainSwap zb.danRer2/axtChain/all.chain bz.danRer2/axtChain/all.chain
    #	1 minutes

    cd bz.danRer2/axtChain
    chainPreNet all.chain ../../zb.danRer2/S2.len \
		../../zb.danRer2/S1.len stdout \
	| chainNet stdin -minSpace=1 ../../zb.danRer2/S2.len \
		../../zb.danRer2/S1.len stdout /dev/null \
	| netSyntenic stdin noClass.net
    #	6 minutes
    mkdir ../axtNet
    netToAxt noClass.net all.chain /cluster/data/monDom1/monDom1.2bit \
	/iscratch/i/danRer2/nib ../axtNet/all.axt
    #	5 minutes

    # Classify net and load into database
    ssh hgwdev
    cd /cluster/data/monDom1/bed/bz.danRer2/axtChain
    netClass -noAr noClass.net monDom1 danRer2 ZFish.net
    #	12 minutes
    netFilter -minGap=10 ZFish.net | hgLoadNet monDom1 netDanRer2 stdin
    #	1 minute

    # Load chains into database
    hgLoadChain -tIndex monDom1 chainDanRer2 all.chain
    #	10 minutes
    # if you do a 'show index' SQL reports Cardinality NULL for the tName
    #	indexes although they are actually working.   To make the
    #	cardinality show up:
    hgsql monDom1 -e "analyze table chainDanRer2;"
    hgsql monDom1 -e "analyze table chainDanRer2Link;"

    featureBits monDom1 chainDanRer2Link
    #	66821827 bases of 3492108230 (1.914%) in intersection
    featureBits monDom1 chainDanRer2
    #	397007345 bases of 3492108230 (11.369%) in intersection
    featureBits monDom1 chainGalGal2Link
    #	110933245 bases of 3492108230 (3.177%) in intersection
    featureBits monDom1 chainGalGal2
    #	2437546469 bases of 3492108230 (69.802%) in intersection
    featureBits monDom1 chainMm5Link

    #	create a copy of for hgdownload  (2005-01-13 - Hiram)
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/monDom1/vsDanRer2
    cd /cluster/data/monDom1/bed/bz.danRer2/axtChain
    gzip --stdout all.chain > \
	/usr/local/apache/htdocs/goldenPath/monDom1/vsDanRer2/zebrafish.chain.gz
    gzip --stdout ZFish.net > \
	/usr/local/apache/htdocs/goldenPath/monDom1/vsDanRer2/zebrafish.net.gz

# BLASTZ ZEBRAFISH CLEAN UP (DONE - 2005-01-12 - Hiram)
    ssh eieio
    cd /cluster/data/monDom1/bed/zb.danRer2/axtChain
    nice rm -rf run1/out run1/chainRaw chain/chain.tab chain/link.tab \
	noClass.net chainBeforeMerge noClass &
    nice gzip chain/* &
    nice gzip zfish.net &
    nice gzip all.chain &

    cd /cluster/data/monDom1/bed/zb.danRer2
    nice rm -rf raw &
    nice gzip axtNet/*.axt &
    nice gzip axtChrom/*.axt &
    nice gzip pslChrom/* lav/*/* &

    cd /cluster/data/monDom1/bed/bz.danRer2/axtChain
    nice rm -f noClass.net link.tab chain.tab &
    nice gzip all.chain ZFish.net &
    cd ../axtNet
    nice gzip all.axt &

# 5-WAY VAR_MULTIZ - ALIGNMENTS Hg17, Mm5, MonDom1, GalGal2, DanRer2
#		DONE - 2005-01-04 - Hiram
    ssh eieio
    cd /cluster/data/monDom1/bed
    mkdir multiz
    bash		#	if you are csh normally
    #	if all.axt has been gzipped, simply use all.axt.gz
    #	It will work the same way
    for A in danRer2 galGal2 mm5 hg17
    do
      axtSort bz.${A}/axtNet/all.axt stdout | axtToMaf -tSplit stdin \
	 -tPrefix=monDom1. /cluster/data/monDom1/chrom.sizes \
	 -qPrefix=${A}. /cluster/data/${A}/chrom.sizes \
	 multiz/${A}
    done
    #	6 minutes
    exit		#	back to csh if you are csh normally

    #	Copy MAFs to Iservers for kluster run
    ssh kkr1u00
    mkdir /iscratch/i/monDom1/multiz
    cd /iscratch/i/monDom1/multiz
    rsync -a /cluster/data/monDom1/bed/multiz/ .
    mkdir penn
    cp -p /cluster/bin/penn/psuCVS/multiz-tba/multiz penn
    cp -p /cluster/bin/penn/maf_project penn
    /cluster/bin/iSync

#       Progressive alignment up the tree w/o stager, 
#       using multiz.v10 (var_multiz)
#       Method: align internal subtrees (using 0 flag to var_multiz)
#               Then, align these to human (using 1 flag to var_multiz)
#       NOTE: must use maf_project after each multiz run, in order
#       to order output.  Single-cov guaranteed by use of net MAF's,
#       so it is not necessary to run single_cov2.

    ssh eieio
    cd /cluster/data/monDom1/bed/multiz

    #	Not sure what this is used for.  Appears to be from something
    #	that is no longer used.  Can find no reference to it in the
    #	procedures.  Below in the parameter estimation for phastCons
    #	there is a tree specified:
    #	((((hg17,mm5),galGal2),monDom1),danRer2)
    cat << '_EOF_' > tree.nh
(((hg17,mm5),galGal2),danRer2)
'_EOF_'
    # << this line keeps emacs coloring happy

    # make output dir and run dir
    ssh kk9
    cd /cluster/data/monDom1/bed/multiz
    mkdir -p maf
    mkdir -p run
    cd run

    # create scripts to run var_multiz on cluster

cat > oneMultiz.csh << 'EOF'
#!/bin/csh -fe
    set c = $1
    set multi = /scratch/monDom1/multiz5way.$c
    set pairs = /iscratch/i/monDom1/multiz

    # special mode --
    # with 1 arg, cleanup
    if ($#argv == 1) then
        rm -fr $multi
        exit
    endif

    # special mode --
    # with 3 args, saves an alignment file
    if ($#argv == 3) then
        cp $multi/$2/$c.maf $3
        exit
    endif
        
    set s1 = $2
    set s2 = $3
    set flag = $4

    # locate input files -- in pairwise dir, or multiple dir
    set d1 = $multi
    set d2 = $multi
    if (-d $pairs/$s1) then
        set d1 = $pairs
    endif
    if (-d $pairs/$s2) then
        set d2 = $pairs
    endif
    set f1 = $d1/$s1/$c.maf
    set f2 = $d2/$s2/$c.maf
    # write to output dir
    set out = $multi/${s1}${s2}
    mkdir -p $out

    # check for empty input file
    if (-s $f1 && -s $f2) then
        echo "Aligning $f1 $f2 $flag"
	/iscratch/i/monDom1/multiz/penn/multiz.v10 $f1 $f2 $flag $out/$c.unused1.maf $out/$c.unused2.maf > $out/$c.full.maf
        cat $out/$c.full.maf $out/$c.unused1.maf $out/$c.unused2.maf > \
                $out/$c.tmp.maf
        echo "Ordering $c.maf"
        /iscratch/i/monDom1/multiz/penn/maf_project $out/$c.tmp.maf monDom1.$c > $out/$c.maf
    else if (-s $f1) then
        cp $f1 $out
    else if (-s $f2) then
        cp $f2 $out
    endif
'EOF'
# << keep emacs coloring happy
chmod +x oneMultiz.csh

    #	Copy this script to iscratch
    ssh kkr1u00
    cd /iscratch/i/monDom1/multiz/penn
    cp -p /cluster/data/monDom1/bed/multiz/run/oneMultiz.csh .
    /cluster/bin/iSync

    #	back to run the job
    ssh kki
    cd /cluster/data/monDom1/bed/multiz/run


cat > allMultiz.csh << 'EOF'
#!/bin/csh -fe
    # multiple alignment steps:
    # multiz.v10 human.maf mouse.maf 1
    # multiz.v10 human-mouse.maf chicken.maf 1
    # multiz.v10 human-mouse-chicken.maf zfish.maf 1
set c = $1
/iscratch/i/monDom1/multiz/penn/oneMultiz.csh $c hg17 mm5  1
/iscratch/i/monDom1/multiz/penn/oneMultiz.csh $c galGal2 hg17mm5 1
/iscratch/i/monDom1/multiz/penn/oneMultiz.csh $c danRer2 galGal2hg17mm5 1
# get final alignment file
/iscratch/i/monDom1/multiz/penn/oneMultiz.csh $c danRer2galGal2hg17mm5 /cluster/data/monDom1/bed/multiz/maf/$c.maf
#cleanup
/iscratch/i/monDom1/multiz/penn/oneMultiz.csh $c
'EOF'
# << keep emacs coloring happy
chmod +x allMultiz.csh

cat  << 'EOF' > template
#LOOP
./allMultiz.csh $(root1) {check out line+ /cluster/data/monDom1/bed/multiz/maf/$(root1).maf}
#ENDLOOP
'EOF'

    #	make a list of scaffolds that successfully had actual alignments
    #	at least once somewhere.  Not every scaffold has an alignment.
    cd /iscratch/i/monDom1/multiz
    for D in danRer2 galGal2 hg17 mm5
    do
        cd $D
        ls | sed -e "s/.maf//"
        cd ..
    done | sort -u >  /cluster/data/monDom1/bed/multiz/run/chrom.lst

    cd /cluster/data/monDom1/bed/multiz/run

    gensub2 chrom.lst single template jobList
    para create jobList
    para try; para check
    para push
# Completed: 5517 of 5517 jobs
# CPU time in finished jobs:       8337s     138.95m     2.32h    0.10d  0.000 y
# IO & Wait Time:                 15166s     252.77m     4.21h    0.18d  0.000 y
# Average job time:                   4s       0.07m     0.00h    0.00d
# Longest job:                       65s       1.08m     0.02h    0.00d
# Submission to last job:          1495s      24.92m     0.42h    0.02d

    #	combine results into a single file for loading
    ssh eieio
    cd /cluster/data/monDom1/bed/multiz
    catDir maf | mafFilter stdin -minScore=500 > multiz5way.maf

# Load into database  (DONE - 2005-01-04 - Hiram)
  ssh hgwdev
  cd /cluster/data/monDom1/bed/multiz
  mkdir -p /gbdb/monDom1/multiz5way
  ln -s /cluster/data/monDom1/bed/multiz/multiz5way.maf /gbdb/monDom1/multiz5way
  hgLoadMaf monDom1 multiz5way
    #	Loaded 988922 mafs in 1 files from /gbdb/monDom1/multiz5way
    #	4 minutes to load

# Create pairwise maf files (no longer split across scaffolds)
   ssh eieio
   cd /cluster/data/monDom1/bed/multiz
   mkdir -p pairwise
   catDir galGal2 > pairwise/galGal2.maf
   catDir hg17 > pairwise/hg17.maf
   catDir mm5 > pairwise/mm5.maf
   catDir danRer2 > pairwise/danRer2.maf

# Load pairwise mafs into database  (DONE - 2005-01-04 - Hiram)
   ssh  hgwdev
   set h = /cluster/data/monDom1/bed/multiz/pairwise
   cd $h
   foreach i (*.maf)
      set o = $i:r
      set t = ${o}_netBlastz
      mkdir -p /gbdb/monDom1/multiz5way/$t
      ln -s $h/$i /gbdb/monDom1/multiz5way/$t
      hgLoadMaf monDom1 $t -pathPrefix=/gbdb/monDom1/multiz5way/$t
   end

# CREATE CONSERVATION WIGGLE WITH PHASTCONS (WORKING - 2005-01-04 - Hiram)

# Estimate phastCons parameters
  ssh eieio
  mkdir /cluster/data/monDom1/bed/multiz/cons
  cd /cluster/data/monDom1/bed/multiz/cons

  # Create a starting-tree.mod based on scaffold_13303. (the largest one)
  twoBitToFa ../../../monDom1.2bit -seq=scaffold_13303 scaffold_13303.fa
  /cluster/bin/phast/msa_split ../maf/scaffold_13303.maf --refseq scaffold_13303.fa --in-format MAF --windows 100000000,1000 --out-format SS --between-blocks 5000 --out-root s1
(((hg17,mm5),galGal2),danRer2)
  /cluster/bin/phast/phyloFit -i SS s1.*.ss --tree "((((hg17,mm5),galGal2),monDom1),danRer2)" --out-root starting-tree
  rm scaffold_13303.fa s1.*.ss

  # Create directory full of sequence in individual scaffold files on bluearc.
  ssh kksilo
  mkdir -p /cluster/bluearc/monDom1/scaffoldFa
  cd /cluster/bluearc/monDom1/scaffoldFa
  twoBitToFa /cluster/data/monDom1/monDom1.2bit stdout | faSplit byname stdin foo

  # Create big bad bloated SS files in bluearc (takes ~45 minutes)
  mkdir -p /cluster/bluearc/monDom1/cons/ss
  cd /cluster/bluearc/monDom1/cons/ss
  foreach i (`awk '{print $1}' /cluster/data/monDom1/chrom.sizes`)
      if (-e /cluster/data/monDom1/bed/multiz/maf/$i.maf) then
	  echo msa_split $i
	  /cluster/bin/phast/msa_split /cluster/data/monDom1/bed/multiz/maf/$i.maf \
	     --refseq /cluster/bluearc/monDom1/scaffoldFa/$i.fa \
	     --in-format MAF --windows 1000000,0 --between-blocks 5000 \
	     --out-format SS --out-root $i
      endif
  end

  # Create a random list of 50 1 mb regions 
  ls -l | awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs

  # Set up parasol directory to calculate trees on these 50 regions
  ssh kki
  mkdir /cluster/bluearc/monDom1/cons/treeRun1
  cd /cluster/bluearc/monDom1/cons/treeRun1
  mkdir tree log

  # Create little script that calls phastCons with right arguments
  #	I wonder if target-coverage here has to go along with the
  #	readjusted target-coverage below as it is tuned up ?
  #	In that case, this would be 0.50 here.
  cat > makeTree << '_EOF_'
  /cluster/bin/phast/phastCons ../ss/$1.ss \
      /cluster/data/monDom1/bed/multiz/cons/starting-tree.mod \
      --gc 0.410 --nrates 1,1 --no-post-probs --ignore-missing \
      --expected-lengths 12 --target-coverage 0.35 \
      --quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
  chmod a+x makeTree

  # Create gensub file
  cat > template << '_EOF_'
#LOOP
makeTree $(root1)
#ENDLOOP
'_EOF_'

  # Make cluster job and run it
  gensub2 ../randomSs single template jobList
  para create jobList
  para try/push/check/etc
# Completed: 50 of 50 jobs
# CPU time in finished jobs:       1526s      25.43m     0.42h    0.02d  0.000 y
# IO & Wait Time:                   650s      10.84m     0.18h    0.01d  0.000 y
# Average job time:                  44s       0.73m     0.01h    0.00d
# Longest job:                       84s       1.40m     0.02h    0.00d
# Submission to last job:           168s       2.80m     0.05h    0.00d

  # Now combine parameter estimates.  We can average the .mod files
  # using phyloBoot.  This must be done separately for the conserved
  # and nonconserved models
  ls tree/*.cons.mod > cons.txt
  /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' --output-average ../ave.cons.mod > cons_summary.txt
  ls tree/*.noncons.mod > noncons.txt
  /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' --output-average ../ave.noncons.mod > noncons_summary.txt
  cd ..
  cp -p ave.*.mod /cluster/data/monDom1/bed/multiz/cons

  ssh kk9
  # Create cluster dir to do main phastCons run
  mkdir  /cluster/bluearc/monDom1/cons/consRun1
  cd  /cluster/bluearc/monDom1/cons/consRun1
  mkdir ppRaw bed

  # Create script to run phastCons with right parameters
cat > doPhast << '_EOF_'
mkdir -p ppRaw/$2
/cluster/bin/phast/phastCons ../ss/$1.ss ../ave.cons.mod,../ave.noncons.mod \
   --expected-lengths 12 --target-coverage 0.50 --quiet --seqname $2 \
   --idpref $2 --viterbi bed/$1.bed --score --require-informative 0 > \
   ppRaw/$2/$1.pp
'_EOF_'
  chmod a+x doPhast

  # Create gsub file
cat > template << '_EOF_'
#LOOP
doPhast $(file1) $(root1)
#ENDLOOP
'_EOF_'

  # Create parasol batch and run it
  ls -1 ../ss | sed 's/.ss//' > in.lst
  gensub2 in.lst single template jobList
  para create jobList
  para try/check/push/etc.
# Completed: 8171 of 8171 jobs
# CPU time in finished jobs:      24641s     410.68m     6.84h    0.29d  0.001 y
# IO & Wait Time:                 81696s    1361.60m    22.69h    0.95d  0.003 y
# Average job time:                  13s       0.22m     0.00h    0.00d
# Longest job:                       39s       0.65m     0.01h    0.00d
# Submission to last job:          1128s      18.80m     0.31h    0.01d

    # combine predictions and transform scores to be in 0-1000 interval
    catDir bed | awk \
    	'{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
        /cluster/bin/scripts/lodToBedScore /dev/stdin > \
	/cluster/data/monDom1/bed/multiz/mostConserved.bed

    # Figure out how much is actually covered by the bed files as so:
    awk '{print $3 - $2;}' /cluster/data/monDom1/bed/multiz/mostConserved.bed | addCols stdin
    at 0.35: 99792852.00
    at 0.55: 169027762
    at 0.50: 140230712.00
    at 0.35: 99792852/3492108230 = .028576
    at 0.55: 169027762/3492108230 = .048402 with two points claiming to
	to be negative at 0.00000 changed to 0
    at 0.50: 140230712.00/3492108230 = .040156 - no unusual points noted


    # If the results of the this divided by the non-n genome size (1.5G) aren't
    # around 4%, then do it again, adjusting the target-coverage phastCons
    # parameter.  Beware of negative scores when too high.  The logToBedScore
    # will output an error on any negative scores.

    # Load most conserved track into database
    ssh hgwdev
    cd /cluster/data/monDom1/bed/multiz
    hgLoadBed monDom1 mostConserved mostConserved.bed

    # Create merged posterier probability file and wiggle track data files
    ssh eieio
    cd /cluster/bluearc/monDom1/cons/consRun1
    # interesting sort here on scaffold number and position.
    # first convert the names to have a consistent field separator
    # of . then do the sort on fields 6 and 7 (both numerically)
    # then convert those . field separators back to what they were
    find ./ppRaw -type f | \
	sed -e "s#/scaffold_#.scaffold.#g; s#-#._.#" | \
	    sort -t\. -k6,6n -k7,7n | \
		sed -e "s#.scaffold.#/scaffold_#g; s#\._\.#-#" | xargs cat | \
		    wigEncode stdin phastCons5.wig phastCons5.wib
    # 30 minutes for above
    cp -p phastCons5.wi? /cluster/data/monDom1/bed/multiz/cons
    # 1 minute copy
    
    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /cluster/data/monDom1/bed/multiz/cons
    ln -s `pwd`/phastCons5.wib /gbdb/monDom1/wib/phastCons5.wib
    hgLoadWiggle monDom1 phastCons5 phastCons5.wig
    #  2 minute load

    #  Create histogram to get an overview of all the data
    ssh hgwdev
    cd /cluster/data/monDom1/bed/multiz/cons
    time hgWiggle -doHistogram \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    -db=monDom1 phastCons5 > histogram.data 2>&1
    #	about 15 minutes scan all data

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
        x000000 xffffff xc000ff x66ff66 xffff00 xff0000 xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Opossum Histogram phastCons5 track"
set xlabel " phastCons5 score"
set ylabel "p-Value"
set y2label "Cumulative Probability Distribution"
set y2range [0:1]
set y2tics

plot "histogram.data" using 2:5 title " pValue" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CPD" with lines
'_EOF_'

    display histo.png &

# Pick up photo from NHGRI (DONE - 2004-12-22 - Hiram)
    ssh hgwdev
    cd /tmp
    wget \
	'http://www.genome.gov/Pages/News/Photos/Science/opossum_image.jpg'
	-O opossum_image.jpg
    #	view that image in 'display' to determine crop edges, then:
    convert -crop 1712x1240+560+360 -quality 80 -sharpen 0 \
	-normalize opossum_image.jpg mmd.jpg
    convert -geometry 300x200 -quality 80 mmd.jpg Monodelphis_domestica.jpg
    rm -f mmd.jpg opossum_image.jpg

    cp -p Monodelphis_domestica.jpg /usr/local/apache/htdocs/images
    #	add links to this image in the description.html page, request push

# MAKE DOWNLOADABLE SEQUENCE FILES (WORKING, 2005-01-06, Hiram)
ssh kksilo
cd /cluster/data/monDom1
#- Build the .zip files
rm -rf zip
mkdir zip
#	! 2005-02-08 - Should not be .zip files, should be .gz files
#zip -j zip/softMask.fa.zip maskedScaffolds.fa &
#zip -j zip/hardMask.fa.zip hardMaskedScaffolds.fa &
#zip -j zip/novel.RM.lib.fa.zip jkStuff/monDom1.novel.RM.lib &
#zip ../../zip/trf.bed.zip trfMask.bed &

# RepeatMasker library
gzip -c jkStuff/monDom1.novel.RM.lib > zip/novel.RM.lib.fa.gz
touch -r jkStuff/monDom1.novel.RM.lib zip/novel.RM.lib.fa.gz
# soft masked scaffold fasta
gzip -c maskedScaffolds.fa > zip/softMask.fa.gz
touch -r maskedScaffolds.fa zip/softMask.fa.gz
# hard masked scaffold fasta
gzip -c hardMaskedScaffolds.fa > zip/hardMask.fa.gz
touch -r hardMaskedScaffolds.fa zip/hardMask.fa.gz
# TRF output files
gzip -c bed/simpleRepeat/trfMask.bed > zip/trf.bed.gz
touch -r bed/simpleRepeat/trfMask.bed zip/trf.bed.gz


# get GenBank native mRNAs
cd /cluster/data/genbank
./bin/i386/gbGetSeqs -db=monDom1 -native GenBank mrna \
        /cluster/data/monDom1/zip/mrna.fa
# get GenBank xeno mRNAs
./bin/i386/gbGetSeqs -db=monDom1 -xeno GenBank mrna \
        /cluster/data/monDom1/zip/xenoMrna.fa &
# THERE ARE NO native ESTs - skip this - get native GenBank ESTs
# ./bin/i386/gbGetSeqs -db=monDom1 -native GenBank est \
# /cluster/data/monDom1/zip/est.fa &

cd /cluster/data/monDom1/zip
# zip GenBank native and xeno mRNAs and native ESTs, 
# after the above gbGetSeqs jobs are done.
# zip -j mrna.fa.zip mrna.fa 
# zip -j xenoMrna.fa.zip xenoMrna.fa 
# zip -j est.fa.zip est.fa 
#rm -f mrna.fa xenoMrna.fa est.fa
gzip mrna.fa xenoMrna.fa

#	RepeatMasker out files, we don't have this as traditionally delivered
#	since the process involved turning the data into the database
#	representation so they could be sorted and uniqued.  So instead
#	of delivering the usual files, simply dump the database
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/monDom1/bigZips
cd /usr/local/apache/htdocs/goldenPath/monDom1/bigZips
hgsqldump monDom1 rmsk -T .
#zip -j rmsk.txt.zip rmsk.txt &
#rm rmsk.txt
gzip rmsk.txt &

ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/monDom1
rsync -a --progress /cluster/data/monDom1/zip/ .

for i in *.gz
do
    /bin/echo -n "$i - "
    zcat $i | sum -r
done
# Check zip file integrity
#for i in *.zip
#do
#    unzip -t $i | tail -1
#done
#	No errors detected in compressed data of hardMask.fa.zip.
#	No errors detected in compressed data of mrna.fa.zip.
#	No errors detected in compressed data of novel.RM.lib.fa.zip.
#	No errors detected in compressed data of rmsk.txt.zip.
#	No errors detected in compressed data of softMask.fa.zip.
#	No errors detected in compressed data of trf.bed.zip.
#	No errors detected in compressed data of xenoMrna.fa.zip.

# md5sum *.zip > md5sum.txt
md5sum *.gz > md5sum.txt
# update README.txt.

# ACCESSIONS FOR CONTIGS (DONE 2005-03-16 kate)
#  Used for Assembly track details, and also ENCODE AGP's
    cd /cluster/data/monDom1/bed
    mkdir -p contigAcc
    cd contigAcc
    
    # extract WGS contig to accession mapping from Entrez Nucleotide summaries
    # To get the summary file, access the Genbank page for the project 
    # by searching:
    #       genus[ORGN] AND WGS[KYWD]
    # At the end of the page, click on the list of accessions for the contigs.
    # Select summary format, and send to file.
    # output to file <species>.acc

    contigAccession opossum.acc > contigAcc.tab
    wc -l contigAcc.tab
        # 109065 contigAcc.tab
     grep contig /cluster/data/monDom1/broad.mit.edu/assembly.agp | wc -l
        # 109065
    hgsql monDom1 < $HOME/kent/src/hg/lib/contigAcc.sql
    echo "LOAD DATA LOCAL INFILE 'contigAcc.tab' INTO TABLE contigAcc" | \
        hgsql monDom1
    hgsql monDom1 -e "SELECT COUNT(*) FROM contigAcc"
        # 109065 contigAcc.tab

#########################################################################
# MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram)

# EXTRACT AXTs and MAFs FROM OPOSSUM (monDom1) NET FOR ZEBRAFISH (danRer2)
# zb.danRer2 are the blastz alignments, chains and nets for 
# danRer2 as target and monDom1 as query. (DONE, 2005-05-13, hartera)
# GZIP AXTs AND MAFs (DONE, 2005-05-15, hartera)

    ssh kkstore01
    # create axts
    cd /cluster/data/monDom1/bed/blastz.monDom1.to.danRer2/axtChain
    gunzip zfish.net.gz
    gunzip chain/*.chain.gz
    netSplit zfish.net zfishNet
    mkdir -p ../axtNet
cat > axtNet.csh << 'EOF'
    foreach f (zfishNet/chr*.net)
        set c = $f:t:r
        echo "axtNet on $c"
        netToAxt zfishNet/$c.net chain/$c.chain \
                 /cluster/data/danRer2/nib \
                 /cluster/data/monDom1/monDom1.2bit ../axtNet/$c.axt
    echo "Complete: $c.net -> $c.axt"
    end
'EOF'
    chmod +x axtNet.csh
    csh axtNet.csh >&! axtNet.log &
    tail -100f axtNet.log
    # sort axts before making mafs - must be sorted for multiz
    cd /cluster/data/monDom1/bed/blastz.monDom1.to.danRer2
    mv axtNet axtNet.unsorted
    mkdir axtNet
    foreach f (axtNet.unsorted/*.axt)
        set c = $f:t:r
        echo "Sorting $c"
        axtSort $f axtNet/$c.axt
    end
    # create maf
    ssh kkstore01
    cd /cluster/data/monDom1/bed/blastz.monDom1.to.danRer2/axtNet
    mkdir ../mafNet
cat > makeMaf.csh << 'EOF'
    foreach f (chr*.axt)
      set maf = $f:t:r.monDom1.maf
      echo translating $f to $maf
      axtToMaf $f \
          /cluster/data/danRer2/chrom.sizes /cluster/data/monDom1/chrom.sizes \
          ../mafNet/$maf -tPrefix=danRer2.  -qPrefix=monDom1.
    end
'EOF'
    chmod +x makeMaf.csh
    csh makeMaf.csh >&! makeMaf.log &
    tail -100f makeMaf.log
    cd /cluster/data/monDom1/bed/blastz.monDom1.to.danRer2/axtChain
    nice gzip zfish.net chain/*.chain &
    cd /cluster/data/monDom1/bed/blastz.monDom1.to.danRer2
    nice gzip axtNet/*.axt mafNet/*.maf &


# LIFTOVER CHAINS TO MONDOM2 (WORKING 2006-02-07 kate)
    # NOTE: split monDom2 is doc'ed in makeMonDom2.doc

    ssh kkstore01
    cd /cluster/data/monDom1

    # split scaffolds and convert to 2bit, as done for cow
    mkdir split2bit
    cd split2bit
    faSplit sequence ../maskedScaffolds.fa 50 x.
    foreach f (*.fa)
        echo $f
        faToTwoBit $f $f:r.2bit
        rm $f
    end
    cd ..
    mkdir -p /san/sanvol1/scratch/monDom1
    cp -r split2bit /san/sanvol1/scratch/monDom1
    rm -fr split2bit
    cp /cluster/bluearc/monDom1/11.ooc /san/sanvol1/scratch/monDom1/

    # run alignment
    ssh pk
    cd /cluster/data/monDom1
    makeLoChain-align monDom1 /san/sanvol1/scratch/monDom1/split2bit \
                      monDom2 /san/sanvol1/scratch/monDom2/liftSplits/split \
                        /san/sanvol1/scratch/monDom1/11.ooc
    cd bed
    rm blat.monDom2
    ln -s blat.monDom2.2006-02-07 blat.monDom2
    cd blat.monDom2/run
    para try
    para check
    para push

    # 2500 jobs

    # All jobs crash when run via cluster, though a test
    # manual job on a cluster job ran successfully:
#   blat /san/sanvol1/scratch/monDom1/split2bit/x18.2bit /san/sanvol1/scratch/monDom2/liftSplits/split/x19.fa ../raw/x18_x19.psl  -tileSize=11 -ooc=/san/sanvol1/scratch/monDom1/11.ooc -minScore=100 -minIdentity=98 -fastMap
# Loaded 83749395 letters in 35 sequences
# Searched 71987986 bases in 7199 sequences

    # the job took ~7 hours -- 
    # this needs to be broken up into much smaller jobs.


