# for emacs: -*- mode: sh; -*-


#	Creating the assembly for Monodelphis domestica
#	South American, Short-tailed Opossum
#	http://www.genome.gov/11510687
#	http://www.genome.gov/12512285

#########################################################################
# DOWNLOAD SEQUENCE (DONE - 2005-08-18 - Hiram)
    ssh kkstore01
    mkdir -p /cluster/store1/monDom2/broad.mit.edu
    ln -s /cluster/store1/monDom2 /cluster/data/monDom2
    cd /cluster/data/monDom2/broad.mit.edu
    wget --timestamping --force-directories --directory-prefix=. \
        --dont-remove-listing --recursive --level=4 --no-parent \
        --no-host-directories --cut-dirs=4 \
        ftp://ftp.broad.mit.edu/distribution/assemblies/mammals/monDom2/*
    #	That takes a number of hours, ending up with:
# -rw-rw-r--  1        843 Jun 14 17:34 ForDistribution.command
# -rw-rw-r--  1 1199352275 Jun 14 17:35 V3.0_prel.agp.chromosome.fasta.gz
# -rw-rw-r--  1    6918810 Jun 14 17:36 assembly.agp
# -rw-rw-r--  1  470824048 Jun 14 17:36 V3.0_prel.agp.chromosome.qual.gz
# -rw-rw-r--  1  467679128 Jun 14 17:37 assembly.quals.gz
# -rw-rw-r--  1   14839262 Jun 14 17:37 assembly.markup
# -rw-rw-r--  1    2577871 Jun 14 17:37 assembly.links
# -rw-rw-r--  1 1196979152 Jun 14 17:37 assembly.bases.gz
# -rw-rw-r--  1         64 Jun 14 17:41 source
# -rw-rw-r--  1  175152051 Jun 14 17:41 assembly.unplaced
# -rw-rw-r--  1 3568712708 Jun 14 17:41 assembly.reads
# -rw-rw-r--  1  760535060 Jun 14 17:42 unplaced.fasta.gz
# -rw-rw-r--  1 2092672289 Jun 14 17:43 unplaced.qual.gz

#########################################################################
# DATA INTEGRITY VERIFICATION (DONE - 2005-08-18 - Hiram)
    ssh kkstore01
    mkdir /cluster/data/monDom2/dataCheck
    cd /cluster/data/monDom2/dataCheck
    zcat ../broad.mit.edu/assembly.bases.gz | grep "^>" \
	| sed -e "s/^>//" > contigs.names 
    #	should be no duplicates
    sort contigs.names | wc
    #	65946   65946  846188
    sort -u contigs.names | wc
    #	65946   65946  846188

    sort contigs.names | head -3
    #	contig_0
    #	contig_1
    #	contig_10
    sort contigs.names | tail -3
    #	contig_9997
    #	contig_9998
    #	contig_9999
    wc contigs.names
    #	65946  65946 846188 contigs.names
    zcat ../broad.mit.edu/assembly.quals.gz | grep "^>" \
	| sed -e "s/^>//" > contigs.quals.names 
    #	should be the same:
    sum -r contigs*
    #	39126   827 contigs.names
    #	39126   827 contigs.quals.names
    awk '{print $6}' ../broad.mit.edu/assembly.agp \
	| grep contig_ > assembly.agp.contig.names
    #   The assembly agp contig list appears to be the same contig list
    sum -r assembly.agp.contig.names
    #	39126   827
    awk '{print $1}' ../broad.mit.edu/assembly.agp \
	| sort -u > assembly.agp.scaffold.names
    wc assembly.agp.scaffold.names
    #	5077  5077 69968 assembly.agp.scaffold.names
    head -3 assembly.agp.scaffold.names
    #	scaffold_0
    #	scaffold_1
    #	scaffold_10
    tail -3 assembly.agp.scaffold.names
    #	scaffold_997
    #	scaffold_998
    #	scaffold_999
    zcat ../broad.mit.edu/V3.0_prel.agp.chromosome.fasta.gz | \
	grep "^>" | sed -e "s/^>//; s/\..*//" > assembly.fasta.scaffold.names
    #	600 duplicate names in this listing:
    sort assembly.fasta.scaffold.names | wc
    #	5677    5677   76915
    sort -u  assembly.fasta.scaffold.names | wc
    #	5077    5077   69968
    #	This is due to their partitioning of large sequences into
    #	5 Mb chunks, e.g:
    #	>scaffold_0.1-5000000 (TestV3.0_prel)
    #	>scaffold_0.5000001-10000000 (TestV3.0_prel)
    #	>scaffold_0.10000001-15000000 (TestV3.0_prel)
    faCount ../broad.mit.edu/V3.0_prel.agp.chromosome.fasta.gz \
	> faCount.assembly.agp.fasta
#   seq          len               A              C               G
# total   3575239585      1087726608      661068673       661056691
#                      T             N             cpg
#	      1086593681      78793932        16830562
    #	Shortest contig:
    sort -k2,2n faCount.assembly.agp.fasta | head
    #	scaffold_5076 at 2002 bases
    #	And many at the 5,000,000 chunk size.
    faCount ../broad.mit.edu/assembly.bases.gz > faCount.contigs
    #	there are 65,946 contigs
#   seq          len               A              C               G
# total   3496445653      1087726608      661068673       661056691
#                T      N            cpg
#	1086593681      0       16830563
    #	shortest contig is 2000 bases (about 10 of this size)
    #	longest is 1,015,767 (contig_37046)
    grep "^contig" faCount.contigs | awk '{print $2}' \
	| textHistogram -binSize=1000 -minVal=2000 -maxBinCount=1000 stdin \
	    | head
    #	beginning of histogram:
# 2000 ************************************************************ 4175
# 3000 ******************************************** 3072
# 4000 *************************** 1862
# 5000 ******************************** 2199
# 6000 ****************************** 2121
# 7000 ***************************** 1986
# 8000 ************************ 1662
# 9000 ******************* 1335
#10000 ***************** 1164
#11000 **************** 1126

#########################################################################
# COMBINE scaffold fasta pieces into single chunks, remove the extra
#       base-range info from the names. (DONE - Hiram - 2005-08-19)
    ssh kkstore01
    cd /cluster/data/monDom2/broad.mit.edu
    cat << '_EOF_' > collapseScaffolds.sh
#!/bin/sh
#
zcat V3.0_prel.agp.chromosome.fasta.gz | awk '
BEGIN { name="" }
/^>/ {
    id=$1
    sub(">","",id)
    sub("\\..*","",id)
    if (!match(name,id)) {
        printf ">%s\n", id
        name=id
        }
}
/^[^>]/ { print }
'
'_EOF_'
    # happy emacs

    chmod +x collapseScaffolds.sh
    time ./collapseScaffolds.sh | gzip > scaffolds.fa.gz
    #   17 minutes
    # make sure we have not damaged the sequence:
    time faCount scaffolds.fa.gz > faCount.scaffolds.fa
    #   2 minutes
    tail -1 faCount.scaffolds.fa
#   seq          len               A              C               G
# total   3575239585      1087726608      661068673       661056691
#                      T             N             cpg
#	      1086593681      78793932        16830563
    #   Those are the same numbers as above (well, one more cpg)
    #			that could happen if one more is found at a
    #			boundary break

#########################################################################
# MAKE 2BIT NIB FILE (DONE - Hiram - 2005-08-19)
#	This is rebuilt again after repeat and simple masking are applied.
#	This first copy is just to get a browser running.
    ssh kkstore01
    cd /cluster/data/monDom2
    time faToTwoBit broad.mit.edu/scaffolds.fa.gz monDom2.2bit
    #	2 minutes
    #   check that the sequence hasn't been damaged
    time twoBitToFa monDom2.2bit stdout | faCount stdin > faCount.2bit
    tail -1 faCount.2bit
#   seq          len               A              C               G
# total   3575239585      1087726608      661068673       661056691
#                      T             N             cpg
#	      1086593681      78793932        16830563
    #   still the same numbers, looking good
    #	The largest scaffold (0) is 199,827,570 !
    #	with seven of them over 100,000,000

#########################################################################
# CREATE DATABASE (DONE - Hiram - 2004-12-02)
    ssh hgwdev
    cd /cluster/data/monDom2
    mkdir /gbdb/monDom2
    ln -s /cluster/data/monDom2/monDom2.2bit /gbdb/monDom2
    twoBitInfo monDom2.2bit stdout |
        awk '{printf "%s\t%s\t/gbdb/monDom2/monDom2.2bit\n", $1,$2}' \
        > chromInfo.tab

    hgsql -e "create database monDom2;" mysql
    hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp;" \
                monDom2
    hgsql monDom2 < $HOME/kent/src/hg/lib/chromInfo.sql
    hgsql -e 'load data local infile "chromInfo.tab" into table chromInfo;' \
        monDom2
    #   generate chrom.sizes list in order by size, biggest first
    #   This listing is going to be used in a variety of procedures
    #   below
    twoBitInfo monDom2.2bit stdout | sort -rn +1 > chrom.sizes
    hgsql monDom2 -N -e 'select chrom from chromInfo' > chrom.lst

    # Enter monDom2 into dbDb and defaultDb so test browser knows about
    # it:
    hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
        defaultPos, active, orderKey, genome, scientificName, \
        htmlPath, hgNearOk, hgPbOk, sourceName) \
        VALUES("monDom2", "June 2005", "/gbdb/monDom2", "Opossum", \
        "scaffold_0:1000000-11000000", 1, 34, "Opossum", \
        "Monodelphis domestica", \
        "/gbdb/monDom2/html/description.html", 0, 0, \
        "Broad Inst. V3 Prelim Jun05");' \
        -h localhost hgcentraltest
    #	do this update later when it is time to make this be the default
    hgsql -e 'update defaultDb set name="monDom2" where genome="Opossum";' \
	hgcentraltest
    hgsql -e 'update dbDb set defaultPos="scaffold_0:1000000-11000000" where name="monDom2";' \
	hgcentraltest

    # Make trackDb table so browser knows what tracks to expect
    mkdir /cluster/data/monDom2/html
    ln -s /cluster/data/monDom2/html /gbdb/monDom2/html
    mkdir $HOME/kent/src/hg/makeDb/trackDb/opossum/monDom2
    cd $HOME/kent/src/hg/makeDb/trackDb/opossum
    cvs add monDom2
    cd monDom2
    cp -p /cluster/data/monDom1/html/description.html ./description.html
    #	edit that description to update
    cvs add description.html
    cvs commit
    cd $HOME/kent/src/hg/makeDb/trackDb
    #	add monDom2 to the makefile, then
    make DBS=monDom2

#######################################################################
# LOAD GAP & GOLD TABLES FROM AGP (DONE - 2005-08-22 - Hiram)
    ssh hgwdev
    cd /cluster/data/monDom2
    hgGoldGapGl -noGl monDom2 broad.mit.edu/assembly.agp
    #   Verify sanity of indexes
    hgsql -e "show index from gold;" monDom2
    hgsql -e "show index from gap;" monDom2
    #   Look at the Cardinality column, it should have some healthy
    #   numbers for all indices
    # For some reason, the indices do not get built correctly --
    # "show index from gap/gold" shows NULL cardinalities for chrom.  
    # Rebuild indices with "analyze table".
    hgsql -e "analyze table gold; analyze table gap;" monDom2

#######################################################################
#  CHUNK up scaffolds for repeat masking (DONE - Hiram - 2005-08-19)
    ssh kkstore01
    cd /cluster/data/monDom2
    mkdir scripts
    #	breaks all scaffolds at 500,000 boundaries, result is to a
    #	single file.  The tiny scaffolds under that size remain intact.
    time faSplit size broad.mit.edu/scaffolds.fa.gz 500000 -oneFile \
          scaffoldsSplit -lift=scripts/scaffoldsSplit.lft
    #	2 minutes

    #	Now split that single file into 200,000 chunks, keeping fa record
    #	boundaries, this has the effect of putting all the tiny
    #	scaffolds together in larger single files.  The 500,000 scaffold
    #	chunks will be their own separate files.
    mkdir chunks500k
    time faSplit about scaffoldsSplit.fa 200000 chunks500k/chunk_
    #	2 minutes, makes over 7000 files:
    ls chunks500k | wc
    #	344    7344  101716
    #   copy to bluearc for efficient transfer to iservers
    mkdir /cluster/bluearc/monDom2
    rsync -a --progress `pwd`/chunks500k/ /cluster/bluearc/monDom2/chunks500k/

    #	Then to the iservers
    ssh kkr1u00
    mkdir /iscratch/i/monDom2
    cd /iscratch/i/monDom2
    rsync -a --progress /cluster/bluearc/monDom2/chunks500k/ ./chunks500k/

    #	And the special repeat masker library from previous opossum
    cp -p /iscratch/i/monDom1/monDom1.novel.RM.lib . 
    #	efficient rsync, just this data
    for i in 2 3 4 5 6 7 8
do
    rsync -a --progress --delete --rsh=rsh /iscratch/i/monDom2/ \
	kkr${i}u00:/iscratch/i/monDom2/
done

#######################################################################
# GC5BASE (DONE - 2005-08-19 - Hiram)
#	This can be done while the split above and copy is taking place.
    ssh kkstore01
    mkdir /cluster/data/monDom2/bed/gc5Base
    cd /cluster/data/monDom2/bed/gc5Base
    time hgGcPercent -wigOut -doGaps -file=stdout -win=5 monDom2 \
        /cluster/data/monDom2 | wigEncode stdin gc5Base.wig gc5Base.wib
    #	19 minutes
    wc gc5Base.wig
    #	693243  9012159 72153841 gc5Base.wig

    ssh hgwdev
    cd /cluster/data/monDom2/bed/gc5Base
    mkdir /gbdb/monDom2/wib
    ln -s `pwd`/gc5Base.wib /gbdb/monDom2/wib
    hgLoadWiggle monDom2 gc5Base gc5Base.wig

#######################################################################
# RUN REPEAT MASKER second time (DONE - 2006-01-20 - 2006-01-23 - Hiram)
    ssh pk
    #	move the older RM directories aside, then start anew:
    mkdir /cluster/data/monDom2/RMRun
    cd /cluster/data/monDom2/RMRun
    #	Initialize the RM libraries for this species on this new repeat
    #	masker
    /cluster/bluearc/RepeatMasker060120/RepeatMasker \
	-species "Monodelphis domestica" /dev/null

# RepeatMasker version development-$Id: RepeatMasker,v 1.11
#	2006/01/20 21:10:59 hiram Exp $
# Search engine: Crossmatch
# Using Master Database:
#	/cluster/bluearc/RepeatMasker060120/Libraries/RepeatMaskerLib.embl
# Using general libraries in:
# /cluster/bluearc/RepeatMasker060120/Libraries/20060120/general
# Building species libraries in:
#  /cluster/bluearc/RepeatMasker060120/Libraries/20060120/monodelphis_domestica
# 
    #	That creates these five files:
    ls -og \
  /cluster/bluearc/RepeatMasker060120/Libraries/20060120/monodelphis_domestica
    #	-rw-rw-r--    1    57952 Jan 20 14:22 shortcutlib
    #	-rw-rw-r--    1    20772 Jan 20 14:22 cutlib
    #	-rw-rw-r--    1    86793 Jan 20 14:22 shortlib
    #	-rw-rw-r--    1    89451 Jan 20 14:22 longlib
    #	-rw-rw-r--    1   194233 Jan 20 14:22 retrolib

    cd /cluster/data/monDom2

    cat << '_EOF_' > scripts/RMOpossumV3
#!/bin/csh -fe

cd $1
/bin/mkdir -p /scratch/tmp/monDom2/$2
/bin/cp /san/sanvol1/scratch/monDom2/chunks500k/$2 /scratch/tmp/monDom2/$2/
pushd /scratch/tmp/monDom2/$2
/cluster/bluearc/RepeatMasker060120/RepeatMasker -s \
        -species "Monodelphis domestica" $2
popd
/bin/cp -p /scratch/tmp/monDom2/$2/$2.out ./$2.out
/bin/rm -fr /scratch/tmp/monDom2/$2/*
/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom2/$2
/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom2
'_EOF_'
    # happy emacs
    chmod +x scripts/RMOpossumV3

    mkdir RMRun RMOut
    ls chunks500k | while read chunk
    do
        echo ../scripts/RMOpossumV3 /cluster/data/monDom2/RMOut $chunk \
           '{'check in line+ /san/sanvol1/scratch/monDom2/chunks500k/$chunk'}' \
         '{'check out line+ /cluster/data/monDom2/RMOut/$chunk.out'}'
    done > RMRun/jobList

    cd RMRun
    para create jobList
    para try ... check ... push ... etc ...
XXXX - running 2006-01-20
Completed: 7344 of 7344 jobs
CPU time in finished jobs:   26250196s  437503.27m  7291.72h  303.82d  0.832 y
IO & Wait Time:               1295145s   21585.75m   359.76h   14.99d  0.041 y
Average job time:                3751s      62.51m     1.04h    0.04d
Longest running job:                0s       0.00m     0.00h    0.00d
Longest finished job:            7619s     126.98m     2.12h    0.09d
Submission to last job:        248204s    4136.73m    68.95h    2.87d
    #	There were two failures that pushed out the total time
    #	Without any failures it would take about 12 hours

    ssh kkstore01
    cd /cluster/data/monDom2
    #	Place all these results together into a single file
    #	Start with just the header
    head -3 RMOut/chunk_00.fa.out > RMRun/scaffolds.fa.RM.out
    #	Then everything but the header
    time find ./RMOut -type f | sort -t_ -k2,2n | while read F
    do
	tail +4 ${F} >> RMRun/scaffolds.fa.RM.out
    done
    #	real    2m23.526s
    #	7 Million lines, 900 Mb file
    wc RMRun/scaffolds.fa.RM.out
    #	7239721 108831708 933301569 RMRun/scaffolds.fa.RM.out
    #	lift the split scaffolds to scaffold coordinates
    cd RMRun
    time liftUp -type=.out rmsk.out \
		../scripts/scaffoldsSplit.lft warn scaffolds.fa.RM.out

    #	And now, load them up
    ssh hgwdev
    cd /cluster/data/monDom2/RMRun
    time hgLoadOut -verbose=2 -nosplit monDom2 rmsk.out > load.out 2>&1
    #	note: 75 records dropped due to repStart > repEnd
    #	15 minutes

    time featureBits monDom2 rmsk
    #	1811406819 bases of 3496445653 (51.807%) in intersection
    #	real    6m19.242s

    #	the first RM run had a featureBits of:
    #	1770704342 bases of 3496445653 (50.643%) in intersection
    #	the second RM run had a featureBits of:
    #	1769890042 bases of 3496445653 (50.620%) in intersection
    #	real    13m1.867s

    head -3 RMOutExtra/chunk_00.fa.out > RMExtra1/extra.out
    time find ./RMOutExtra -type f | sort -t_ -k2,2n | while read F
    do
	tail +4 ${F} >> RMExtra1/extra.out
    done
#######################################################################
# RUN REPEAT MASKER (DONE - 2005-08-19 - Hiram)(OBSOLETE, see above)
    #	Reusing the special repeat masker addition from monDom1
    #   Will run both -mammal and this one, and combine the results.
    #   the library is a copy from monDom1/jkStuff/monDom1.novel.RM.lib
    #	### !!! *** Next time it would be better to add this library
    #	into the RM set of libraries so only one run would be needed.
    #	### !!! ***
    ssh pk
    mkdir /san/sanvol1/scratch/monDom2
    cd /san/sanvol1/scratch/monDom2
    rsync -a --progress /cluster/bluearc/monDom2/ ./
    cp -p /cluster/data/monDom1/jkStuff/monDom1.novel.RM.lib .

    mkdir /cluster/data/monDom2/RMOut
    cd /cluster/data/monDom2
    #	This script could be smarter and do the job of removing
    #	duplicates from the combination of the two results.  Next time
    #	This can be done as it is done below with the awk script during
    #	the liftUp.
    cat << '_EOF_' > scripts/RMOpossum
#!/bin/csh -fe

cd $1
/bin/mkdir -p /tmp/monDom2/$2
/bin/cp /san/sanvol1/scratch/monDom2/chunks500k/$2 /tmp/monDom2/$2/
pushd /tmp/monDom2/$2
/cluster/bluearc/RepeatMasker050305/RepeatMasker -s -lib /san/sanvol1/scratch/monDom2/monDom1.novel.RM.lib $2
mv $2.out $2.novel.out
/cluster/bluearc/RepeatMasker050305/RepeatMasker -s -species "Monodelphis domestica" $2
tail +4 $2.novel.out >> $2.out
popd
/bin/cp -p /tmp/monDom2/$2/$2.out ./
/bin/rm -fr /tmp/monDom2/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/monDom2/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/monDom2
'_EOF_'
    # happy emacs
    chmod +x scripts/RMOpossum
    mkdir RMRun
    
    ls chunks500k | while read chunk
    do
        echo ../scripts/RMOpossum /cluster/data/monDom2/RMOut $chunk \
           '{'check in line+ /san/sanvol1/scratch/monDom2/chunks500k/$chunk'}' \
         '{'check out line+ /cluster/data/monDom2/RMOut/$chunk.out'}'
    done > RMRun/jobList

    cd RMRun
    para create jobList
    para try ... check ... push ... etc ...
    #   This takes extra long compared to normal because of the two
    #   passes being run on the data
# Completed: 7344 of 7344 jobs
# CPU time in finished jobs:   12118042s  201967.37m  3366.12h  140.26d  0.384 y
# IO & Wait Time:                 46258s     770.97m    12.85h    0.54d  0.001 y
# Average job time:                1656s      27.61m     0.46h    0.02d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            2420s      40.33m     0.67h    0.03d
# Submission to last job:         67144s    1119.07m    18.65h    0.78d
    #	For comparison, this job on monDom1 on the kk kluster was:
# Completed: 7627 of 7627 jobs
# CPU time in finished jobs:  117842813s 1964046.88m 32734.11h 1363.92d  3.737 y
# IO & Wait Time:                704870s   11747.84m   195.80h    8.16d  0.022 y
# Average job time:               15543s     259.05m     4.32h    0.18d
# Longest job:                    24045s     400.75m     6.68h    0.28d
# Submission to last job:        185727s    3095.45m    51.59h    2.15d


    #	the double run above produces duplicate entries for the same
    #	repeat, need to unique the results, and then lift up to scaffold
    #	coordinates  (next time eliminate this step by placing the extra
    #	library into the standard RM libraries)
    ssh kkstore01
    cd /cluster/data/monDom2
cat << '_EOF_' > scripts/uniqueRMOut.sh
#!/bin/sh
FN=$1
if [ ! -f ${FN} ]; then
    echo "Can not find file ${FN}"
    exit 255
fi
#       start with initial header lines
head --lines=3 ${FN}
#	This awk removes the ID number from the end,
#	and it verifies all the lines are of the correct length.
#	The sort following the awk sorts by scaffold part name and
#	chrom start position, the uniq removes duplicate lines, and then
#	the awk following that adds back the ID number (simple sequence
#	count)
awk '
{
if (NF > 14)
  {
  if (! ((NF == 15) || (NF == 16)))
    {
    printf "ERROR: bad line:\n" > "/dev/stderr"
    print $0 > "/dev/stderr"
    }
  for (i=1; i < NF-1; ++i) {printf "%s ", $i}
  i = NF-1
  printf "%s\n", $i;
  }
else
  {
  if (! ((NF == 0) || (NF == 13) || (NF == 14)))
    {
    printf "ERROR: bad line:\n" > "/dev/stderr"
    print $0 > "/dev/stderr"
    }
  }
}
' ${FN} | sort -k5,5 -k6,6n | uniq | awk '
BEGIN { i = 1 }
{ printf "%s %d\n", $0, i++ }
'
'_EOF_'
    # happy emacs
    chmod +x scripts/uniqueRMOut.sh
    mkdir RMOutLifted
    find ./RMOut -type f | while read FN
do
    BN=`basename ${FN}`
    echo "${BN}"
    ./scripts/uniqueRMOut.sh ${FN} \
	| liftUp -type=.out RMOutLifted/${BN} \
		./scripts/scaffoldsSplit.lft warn stdin
done

    #	Place all these results together into a single file
    #	Start with just the header
    head -3 RMOutLifted/chunk_00.fa.out > RMRun/scaffolds.fa.RM.out
    #	Then everything but the header
    find ./RMOutLifted -type f | sort -t_ -k2,2n | while read F
    do
	tail +4 ${F} >> RMRun/scaffolds.fa.RM.out
    done
    #	17 Million lines, 2.2 Gb file

    #	And now, load them up
    ssh hgwdev
    cd /cluster/data/monDom2/RMRun
    time hgLoadOut -nosplit monDom2 scaffolds.fa.RM.out 
    #	note: 74 records dropped due to repStart > repEnd
    #	28 minutes

    time featureBits monDom2 rmsk
    #	1770704342 bases of 3496445653 (50.643%) in intersection
    #	real    15m19.287s
    featureBits monDom1 rmsk
    #	1775646140 bases of 3492108230 (50.847%) in intersection
    time featureBits -countGaps monDom1 rmsk
    #	1775646140 bases of 3563247205 (49.832%) in intersection
    #	real    43m3.513s
    time featureBits -countGaps monDom2 rmsk
    #	1770704342 bases of 3575239585 (49.527%) in intersection
    #	real    15m32.322s

#######################################################################
# SIMPLE REPEAT [TRF] TRACK (DONE - 2005-08-19 - Hiram)
#   This can be done while the above repeat masker run is working
    ssh kk
    mkdir /cluster/data/monDom2/bed/simpleRepeat
    cd /cluster/data/monDom2/bed/simpleRepeat
    mkdir trf
    cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x runTrf

    cat << '_EOF_' > gsub
#LOOP
./runTrf $(path1) {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    #	Normally would be checking input file, but doesn't work today on kk
    #	./runTrf {check in line+ $(path1)}  {check out line trf/$(root1).bed}

    #	funny business here to work around todays kluster difficulties
    #	kk doesn't seem to have a good connection to iscratch since it
    #	comes from kkr2 which is off-line
    ls -1S /cluster/bluearc/monDom2/chunks500k \
	| sed -e "s#^#/iscratch/i/monDom2/chunks500k/#" > genome.lst
    gensub2 genome.lst single gsub jobList
    para create jobList
    para try ... check ... push ... etc ...
    #	there was one unusual long-running job, not sure what happened there
# Completed: 7344 of 7344 jobs
# CPU time in finished jobs:      53229s     887.15m    14.79h    0.62d  0.002 y
# IO & Wait Time:                 19793s     329.89m     5.50h    0.23d  0.001 y
# Average job time:                  10s       0.17m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            4650s      77.50m     1.29h    0.05d
# Submission to last job:          6054s     100.90m     1.68h    0.07d

    #	lift to scaffold coordinates
    ssh kkstore01
    cd /cluster/data/monDom2/bed/simpleRepeat
    find ./trf -type f | xargs cat | \
        liftUp simpleRepeat.bed \
        /cluster/data/monDom2/scripts/scaffoldsSplit.lft \
        warn stdin  > lu.out 2>&1

     # Load into the database:
    ssh hgwdev
    cd /cluster/data/monDom2/bed/simpleRepeat
    hgLoadBed monDom2 simpleRepeat simpleRepeat.bed \
        -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    #	Loaded 707995 elements of size 16

    #   Compare with previous assembly
    time featureBits monDom2 simpleRepeat
    #	69239555 bases of 3496445653 (1.980%) in intersection
    #	real    2m34.603s
    time featureBits monDom1 simpleRepeat
    #	55000238 bases of 3492108230 (1.575%) in intersection
    #	30 minutes
    @	### !!! *** Why are the featureBits on monDom1 taking so long ?

#######################################################################
# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2004-12-06 - Hiram)
# After the simpleRepeats track has been built, make a filtered
# version 
# of the trf output: keep trf's with period <= 12:
    ssh kkstore01
    cd /cluster/data/monDom2/bed/simpleRepeat
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed

#######################################################################
# MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE - 2005-08-22 - Hiram)
#	RE-DONE with new RM output 2006-01-23
#	RE-DONE with new RM output 2005-12-16
    #	maskOutFa is trying to read in the entire input sequence to get
    #	it ready for masking.  This doesn't work on the ordinary memory
    #	machines since scaffolds.fa is the whole thing.
    ssh kolossus
    mkdir /scratch/monDom2
    cd /scratch/monDom2
    cp -p /cluster/store1/monDom2/broad.mit.edu/scaffolds.fa.gz .
    cp -p /cluster/store1/monDom2/bed/simpleRepeat/trfMask.bed .
    rsync -a --progress kkstore01:/cluster/store1/monDom2/RMRun/rmsk.out .

    maskOutFa -soft scaffolds.fa.gz trfMask.bed maskedScaffolds.fa
    maskOutFa -clip -softAdd maskedScaffolds.fa rmsk.out maskedScaffolds.fa
    #	looks good:
# WARNING: negative rEnd: -15 scaffold_0:42772383-42772698 L1_Mars1
# WARNING: negative rEnd: -569 scaffold_1:112455185-112455386 L1_Mars1
# WARNING: negative rEnd: -27 scaffold_8:83916755-83916961 L1_Mars1
# WARNING: negative rEnd: -679 scaffold_8:83917313-83917408 L1_Mars1
# WARNING: negative rEnd: -16 scaffold_12:60303711-60303766 L3
# WARNING: negative rEnd: -1 scaffold_23:22193902-22193931 L1_Mdo1
# WARNING: negative rEnd: -289 scaffold_28:5288792-5288941 L1_Mars1a
# WARNING: negative rEnd: -501 scaffold_28:5289048-5290587 L1_Mars1a
# WARNING: negative rEnd: -67 scaffold_39:3078357-3078508 L1_Mdo5
# WARNING: negative rEnd: -76 scaffold_116:2053077-2053119 L1_Mdo5
# WARNING: negative rEnd: -322 scaffold_127:944681-944774 HAL1_Opos1

# real    3m45.815s

    #	May as well create a 2bit while we are here:
    time faToTwoBit maskedScaffolds.fa masked.2bit
    #	2 minutes
    #	Then, check it:
    twoBitToFa masked.2bit stdout | faCount stdin | tail -1
    #	looks the same as it was before
#   seq          len               A              C               G
# total   3575239585      1087726608      661068673       661056691
#                      T             N             cpg
#	      1086593681      78793932        16830563
    #	compress the masked fa to prepare it to be copied back:
    time gzip maskedScaffolds.fa
    #	real    30m47.082s

    #	Copy the masked 2bit file back to kksilo
    ssh kkstore01
    cd /cluster/data/monDom2
    time rsync -a --progress kolossus:/scratch/monDom2/masked.2bit .
    #	22 seconds
    #	Since this 2bit file is pointed to from /gbdb/monDom2
    #	replace it quickly:
    rm monDom2.2bit; mv masked.2bit monDom2.2bit
    time rsync -a --progress kolossus:/scratch/monDom2/maskedScaffolds.fa .
    #	1m 20s
    #	clean up unneeded .fa
    rm scaffoldsSplit.fa

    #	And clean up kolossus scratch
    ssh kolossus
    cd /scratch/monDom2
    rm -fr *
    cd
    rmdir /scratch/monDom2

    #	Copy it to bluearc scratch/hg for rsync to kluster nodes:
    ssh kkstore01
    cd /cluster/data/monDom2

#######################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2005-08-22 - Hiram)
    ssh kkstore01
    cd /cluster/data/monDom2
    #	use repMatch of 1228 as this genome is ~ %20 larger than human
    #	1024 + (1024 * 0.2) = 1228
    time blat monDom2.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1228
    #	Wrote 43908 overused 11-mers to 11.ooc
    #	real    2m41.153s

    #	copy to san filesystem for pk kluster use
    ssh pk
    cd /san/sanvol1/scratch/monDom2
    cp -p /cluster/data/monDom2/11.ooc .
    cp -p /cluster/data/monDom2/monDom2.2bit .

##########################################################################
# BLASTZ HUMAN hg18 (STARTED - 2006-01-23 - Hiram)

    ssh pk
    mkdir /cluster/data/hg18/bed/blastzMonDom2.2006-01-23
    cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23

    cat << '_EOF_' > DEF
# human vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Human (hg18)
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Opossum monDom2
SEQ2_DIR=/san/sanvol1/scratch/monDom2/monDom2.2bit
SEQ2_LEN=/san/sanvol1/scratch/monDom2/chrom.sizes
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/cluster/data/hg18/bed/blastzMonDom2.2006-01-23
TMPDIR=/scratch/tmp
'_EOF_'
    #	happy emacs

    cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	`pwd`/DEF > blastz.out 2>&1 &
    #	real    912m22.818s

    #	This failed during the load of the chains due to the size of
    #	chr19.chain.  So, go to kolossus:
    ssh kolossus
    #	There isn't any hg18 db here yet, get it established with a
    #	chromInfo and a 2bit sequence:
    hgsql -e "create database hg18;" mysql
    cd /cluster/data/hg18
    twoBitInfo hg18.2bit stdout |
        awk '{printf "%s\t%s\t/gbdb/hg18/hg18.2bit\n", $1,$2}' \
		> chromInfo.kolossus.tab
    hgsql hg18 < $HOME/kent/src/hg/lib/chromInfo.sql
    hgsql hg18 \
-e 'load data local infile "chromInfo.kolossus.tab" into table chromInfo;'
    mkdir /gbdb/hg18
    ln -s /cluster/data/hg18/hg18.2bit /gbdb/hg18/hg18.2bit
    #	now, loading only chr19:
    cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23/axtChain
    hgLoadChain hg18 chr19_chainMonDom2 chain/chr19.chain
    #	while that is running, back on hgwdev, get the other chains loaded
    ssh hgwdev
    cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23/axtChain
    cp loadUp.csh loadUp.noChr19.csh
    #	change the foreach line to eliminate the chr19.chain:
    diff loadUp.csh loadUp.noChr19.csh
    < foreach f (*.chain)
    ---
    > foreach f (`ls *.chain | grep -v chr19.chain`)
    #	And then run that script
    time ./loadUp.noChr19.csh > load.noChr19.out 2>&1
    
    #	When the kolossus load finishes, email to push-request and ask
    #	for the two tables to be pushed from kolossus to hgwdev:
    #	chr19_chainMonDom2
    #	chr19_chainMonDom2Link

    #	then, continuing:
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-continue=download -bigClusterHub=pk -chainMinScore=5000 \
	-chainLinearGap=loose `pwd`/DEF > download.out 2>&1 &
    #	real    2m42.505s

##########################################################################
# BLASTZ HUMAN hg17 (OBSOLETE - 2006-01-23 see above - 2005-08-22 - Hiram)
    ssh pk
    mkdir /cluster/data/hg17/bed/blastzMonDom2.2005_08_22
    cd /cluster/data/hg17/bed/blastzMonDom2.2005_08_22
    cat << '_EOF_' > DEF
# Hg17 vs monDom2
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run.v7
BLASTZ=blastz.v7
# Set up blastz parameters using parameters between (chicken and fish ?)
# but not abridging repeats since can't do that with scaffolds, and
# it's not very relevant at this evolutionary distance.
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Hg17
SEQ1_DIR=/san/sanvol1/scratch/hg17/nib
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: MonDom2
SEQ2_DIR=/san/sanvol1/scratch/monDom2/monDom2.2bit
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/hg17/bed/blastzMonDom2.2005_08_22

SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << keep emacs coloring happy

    cp -p /cluster/data/hg17/chrom.sizes ./S1.len
    twoBitInfo /san/sanvol1/scratch/monDom2/monDom2.2bit stdout | \
	sort -rn +1  > S2.len

    #	establish a screen to control this job
    #	altered local copy of doBlastzChainNet.pl to fix santest location
    #	and run everything on pk since the san location is unique here
    screen
    #	to detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh pk
    screen -d -r
    time ./doBlastzChainNet.pl \
	-bigClusterHub=pk \
	-smallClusterHub=pk \
	-fileServer=kkstore02 \
	`pwd`/DEF > blast.run.out 2>&1 &
# Completed: 140151 of 140151 jobs
# CPU time in finished jobs:   17095319s  284921.98m  4748.70h  197.86d  0.542 y
# IO & Wait Time:                549405s    9156.76m   152.61h    6.36d  0.017 y
# Average job time:                 126s       2.10m     0.03h    0.00d
# Longest finished job:           11323s     188.72m     3.15h    0.13d
# Submission to last job:         82333s    1372.22m    22.87h    0.95d
    #	something failed with the kluster, finished off the kluster run
    #	manually, and then start with the next step: cat
    time ./doBlastzChainNet.pl \
	-continue=cat \
	-bigClusterHub=pk \
	-smallClusterHub=pk \
	-fileServer=kkstore02 \
	`pwd`/DEF > cat.run.out 2>&1 &
    #	34 minutes to the next failure:
    cd /cluster/data/hg17/bed/blastzMonDom2.2005_08_22/axtChain/run
    #	to allow script to continue after we fixup this broken part
    para time > run.time
    cat run.time
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs:      20950s     349.17m     5.82h    0.24d  0.001 y
# IO & Wait Time:                   782s      13.03m     0.22h    0.01d  0.000 y
# Average job time:                 483s       8.05m     0.13h    0.01d
# Longest finished job:            2779s      46.32m     0.77h    0.03d
# Submission to last job:          2841s      47.35m     0.79h    0.03d

    #	34 minutes to the next failure:
# Batch failed after 4 tries on chain.csh chr19.nib:chr19: chain/chr19.nib:chr19:.chain
# Command failed:
# ssh -x pk /cluster/data/hg17/bed/blastzMonDom2.2005_08_22/axtChain/run/doChainRun.csh
    #	This is most likely because we aren't working on the kki cluster
    #	Need to get to larger machines.  Go to kolossus to run the last job
    ssh kolossus
    cd /cluster/data/hg17/bed/blastzMonDom2.2005_08_22/axtChain/run
    time ./chain.csh chr19.nib:chr19: chain/chr19.nib:chr19:.chain
    #	real    477m4.130s
    #	user    228m6.042s
    #	sys     162m31.903s
    #	That is a lot of time on kolossus, almost 8 hours !
    #	Creating a 2 Gb file:
    #	-rw-rw-r--  1 2030235712 Aug 23 22:48 chr19.nib:chr19:.chain

    #	To continue the run, back to pk
    ssh pk
    cd /cluster/data/hg17/bed/blastzMonDom2.2005_08_22
    time ./doBlastzChainNet.pl \
	-continue=chainMerge \
	-bigClusterHub=pk \
	-smallClusterHub=pk \
	-fileServer=kkstore02 \
	`pwd`/DEF > chainMerge.run.out 2>&1 &
    #	real    195m24.430s
    #	But it failed !
hgLoadChain hg17 chr19_chainMonDom2 chr19.chain
Out of memory needMem - request size 28 bytes

    #	Need to go back and rescore the chains at 5000
    #	Remove previous results
    ssh kkstore01
    cd /cluster/data/hg17/bed/blastzMonDom2.2005_08_22
    rm -fr axtNet mafNet axtChain

    ssh pk
    cd /cluster/data/hg17/bed/blastzMonDom2.2005_08_22
    time ./doBlastzChainNet.pl \
	-continue=chainRun \
	-chainMinScore=5000 \
	-bigClusterHub=pk \
	-smallClusterHub=pk \
	-fileServer=kkstore02 \
	`pwd`/DEF > chainRun.run.out 2>&1 &
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs:      13835s     230.58m     3.84h    0.16d  0.000 y
# IO & Wait Time:                  2943s      49.06m     0.82h    0.03d  0.000 y
# Average job time:                 373s       6.21m     0.10h    0.00d
# Longest finished job:            1798s      29.97m     0.50h    0.02d
# Submission to last job:          1802s      30.03m     0.50h    0.02d

    #	Same problem as before with chr19
    ssh kolossus
    cd /cluster/data/hg17/bed/blastzMonDom2.2005_08_22/axtChain/run
    time ./chain.csh chr19.nib:chr19: chain/chr19.nib:chr19:.chain
    #	real    262m56.122s
    #	user    193m29.527s
    #	sys     68m42.523s
    #	-rw-rw-r--  1 1450265901 Aug 26 15:27 chain/chr19.nib:chr19:.chain

    #	To continue the run, back to pk
    ssh pk
    cd /cluster/data/hg17/bed/blastzMonDom2.2005_08_22
    time ./doBlastzChainNet.pl \
	-continue=chainMerge \
	-chainMinScore=5000 \
	-bigClusterHub=pk \
	-smallClusterHub=pk \
	-fileServer=kkstore02 \
	`pwd`/DEF > chainMerge.run.out 2>&1 &
XXXX
    #	swap results to place hg17 alignments onto monDom2
    cd /cluster/data/hg17/bed/blastzMonDom2.2005_08_22
    time ./doBlastzChainNet.pl -swap \
	-bigClusterHub=pk \
	-smallClusterHub=pk \
	-fileServer=kkstore02 \
	`pwd`/DEF > swap.run.out 2>&1 &
    #	real    17m13.957s
    #	user    0m0.395s
    #	sys     0m0.148s

##########################################################################
# SELF BLASTZ (TBD - 2005-08-22 - Hiram)
#	SELF track not actually that useful on this assembly, not done.
    #	Going to need a set of nibs for this thing to use as SEQ1.
    #	There's only about 5077 of them, so this is feasible.
    #	This wouldn't work if there were a lot more.
    ssh kkstore01
    mkdir /cluster/data/monDom2/maskedScaffolds
    cd /cluster/data/monDom2/maskedScaffolds
    faSplit byname ../maskedScaffolds.fa.gz .
    mkdir /cluster/data/monDom2/maskedNibs
    cd /cluster/data/monDom2/maskedNibs
    ls ../maskedScaffolds | while read S
do
    N=${S/.fa/}
    echo faToNib -softMask ../maskedScaffolds/${S} ./${N}.nib
    faToNib -softMask ../maskedScaffolds/${S} ./${N}.nib
done

    #	place them on the san filesystem for pk kluster run
    ssh pk
    rsync -a --progress /cluster/data/monDom2/maskedNibs/ \
	/san/sanvol1/scratch/monDom2/maskedNibs/

##########################################################################
# QUALITY (DONE - 2005-08-22 - Hiram)
    ssh kkstore01
    mkdir /cluster/data/monDom2/bed/quality
    cd /cluster/data/monDom2/bed/quality
    cp -p /cluster/data/monDom1/bed/quality/qaToWigEncode.pl .
    #	using perl script from previous build
    #	Update the file name in this file:
my $AGP_FILE="/cluster/data/monDom2/broad.mit.edu/assembly.agp";

    #	convert the qual.fa to wiggle single column input and then wigEncode:
    zcat ../../broad.mit.edu/V3.0_prel.agp.chromosome.qual.gz \
	| ./qaToWigEncode.pl /dev/stdin \
	| wigEncode stdin quality.wig quality.wib
    #	Converted stdin, upper limit 50.00, lower limit 0.00
    #	This is interesting, in monDom1 we had an upper limit of 68
    #	1 hour 34 minutes of processing time.
    #	Resulting files:
    ls -og quality.wi?
    #	-rw-rw-r--  1 3496445653 Aug 22 17:51 quality.wib
    #	-rw-rw-r--  1  360668080 Aug 22 17:51 quality.wig
    #	Note how the byte size of the .wib file is exactly equal
    #	to the number bases in a featureBits run without -countGaps
    #	from the rmsk featureBits above:
    #	1770704342 bases of 3496445653 (50.643%) in intersection
    #	This means we have a valid quality score for every base that is
    #	not in a gap, which is what we want

    ssh hgwdev
    cd /cluster/data/monDom2/bed/quality
    ln -s `pwd`/quality.wib /gbdb/monDom2/wib
    time hgLoadWiggle -pathPrefix=/gbdb/monDom2/wib monDom2 quality quality.wig
    #	5 minute load time
    #	verify index:
    hgsql -e "show index from quality;" monDom2
    #	check that Cardinality is a number and is not NULL

##########################################################################
# PRODUCING GENSCAN PREDICTIONS (DONE - 2005-08-23 - Hiram)
#	RE-DONE with new masked sequence 2006-01-23 - Hiram

    #	Need a hard masked file
    ssh kolossus
    mkdir /scratch/monDom2
    cd /scratch/monDom2
    rsync -a --progress kkstore01:/cluster/data/monDom2/maskedScaffolds.fa .
    time maskOutFa maskedScaffolds.fa hard hardMaskedScaffolds.fa
    #	1m 25sec
    #	send the result back  (these rsync operations are faster than
    #	simply copying over NFS)
    ssh kkstore01
    cd /cluster/data/monDom2
    time rsync -a --progress kolossus:/scratch/monDom2/hardMaskedScaffolds.fa .
    #	break all these hard masked scaffolds at 5,000,000 boundaries,
    #	result is to a single file.  The tiny scaffolds under that
    #	size remain intact.
    time faSplit -lift=genscan.lft -oneFile size \
	hardMaskedScaffolds.fa 5000000 genscan_ 
    #	5677 pieces of 5677 written
    #	real    2m1.146s

    #	save lift file for later liftUp
    cp -p genscan.lft /cluster/data/monDom2/scripts
    #	Now split that single file into 200,000 chunks, keeping fa record
    #	boundaries, this has the effect of putting all the tiny
    #	scaffolds together in larger single files.  The 5,000,000 scaffold
    #	chunks will be their own separate files.
    mkdir hardMasked200K
    time faSplit about genscan_.fa 200000 hardMasked200K/c_
    #	real    2m5.636s
    #	makes 1169 files:
    ls hardMasked200K | wc
    #	1169    1169   10590

    #	Clean out the /cluster/bluearc/monDom2/ files that were used
    #	during repeat masking, then, this copy
    mkdir /san/sanvol1/scratch/monDom2/hardMasked200K/
    cd /san/sanvol1/scratch/monDom2/hardMasked200K/
    rsync -a --progress /cluster/data/monDom2/hardMasked200K/ .

    ssh hgwdev
    mkdir /cluster/data/monDom2/bed/genscan
    cd /cluster/data/monDom2/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux
    # Make 3 subdirectories for genscan to place output files
    mkdir gtf pep subopt

    #	This should not be done on the pk - these 5,000,000 sized chunks
    #	cause too much memory to be used for the kk nodes.
    #	These jobs are too large to run on the kk - they occupy about 1 Gb
    #	of memory which is the memory size of a kk node, two of them on
    #	the same node cause too much swapping.
    ssh pk
    cd /cluster/data/monDom2/bed/genscan
    ls -1S /san/sanvol1/scratch/monDom2/hardMasked200K/c*.fa > chunks.list
    cat << '_EOF_' > template
#LOOP
gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    gensub2 chunks.list single template jobList
    para create jobList
    para try, check, push, check, ...
XXX running 2006-01-23
# Completed: 1169 of 1169 jobs
# CPU time in finished jobs:     228087s    3801.45m    63.36h    2.64d  0.007 y
# IO & Wait Time:               1300618s   21676.96m   361.28h   15.05d  0.041 y
# Average job time:                1308s      21.80m     0.36h    0.02d
# Longest finished job:            7290s     121.50m     2.02h    0.08d
# Submission to last job:          7372s     122.87m     2.05h    0.09d

    # Concatenate scaffold-level results:
    ssh kkstore01
    cd /cluster/data/monDom2/bed/genscan
    time cat gtf/*.gtf | liftUp genscan.gtf \
	/cluster/data/monDom2/scripts/genscan.lft error stdin
    time cat subopt/*.bed | liftUp genscanSubopt.bed \
	/cluster/data/monDom2/scripts/genscan.lft error stdin
    cat pep/*.pep > genscan.pep


    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/monDom2/bed/genscan
    ldHgGene -gtf monDom2 genscan genscan.gtf
    #	40051 gene predictions
    hgPepPred monDom2 generic genscanPep genscan.pep
    hgLoadBed monDom2 genscanSubopt genscanSubopt.bed
    #	Loaded 492210 elements of size 6

    #	Compare with other assemblies:
    featureBits monDom2 genscan
    #	48628209 bases of 3496445653 (1.391%) in intersection
    featureBits monDom1 genscan
    #	47170795 bases of 3492108230 (1.351%) in intersection
    featureBits monDom2 genscanSubopt
    #	52714123 bases of 3496445653 (1.508%) in intersection
    featureBits monDom1 genscanSubopt
    #	51805835 bases of 3492108230 (1.484%) in intersection
    featureBits hg17 genscan
    #	55323340 bases of 2866216770 (1.930%) in intersection
    featureBits hg17 genscanSubopt
    #	55986178 bases of 2866216770 (1.953%) in intersection

    # Clean up
    cd /san/sanvol1/scratch/monDom2
    rm -fr hardMasked200K
    #	This kolossus clean up was actually left until much later.
    #	I found these copies here to be useful later on.
    ssh kolossus
    cd /scratch/monDom2
    rm -f *
    cd ..
    rmdir monDom2

##########################################################################
# MAKE Human Proteins track (WORKING - 2005-08-24 - Hiram)
    ssh eieio
    mkdir -p /cluster/data/monDom2/blastDb
    cd /cluster/data/monDom2/blastDb
    faSplit sequence ../broad.mit.edu/scaffolds.fa.unmasked 500 x 
    for i in x*
    do
	formatdb -i $i -p F
    done
    rm *.log *.fa

    ssh kkr1u00
    mkdir -p /iscratch/i/monDom2/blastDb
    cp /cluster/data/monDom2/blastDb/* /iscratch/i/monDom2/blastDb
    (iSync) > sync.out
    exit # back to eieio

    mkdir -p /cluster/data/monDom2/bed/tblastn.hg17KG
    cd /cluster/data/monDom2/bed/tblastn.hg17KG
    ls -1S /iscratch/i/monDom2/blastDb/*.nsq | sed "s/\.nsq//" > query.lst

    calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(350000/`wc query.lst | awk "{print \\\$1}"`\)
    # 42156/(350000/499) = 60.102411
    mkdir -p /cluster/bluearc/monDom2/bed/tblastn.hg17KG/kgfa
    split -l 65 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/monDom2/bed/tblastn.hg17KG/kgfa/kg
    ln -s /cluster/bluearc/monDom2/bed/tblastn.hg17KG/kgfa kgfa
    cd kgfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    mkdir blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.2
	if pslCheck -prot $3.tmp
	then
	    mv $3.tmp $3
	    rm -f $f.1 $f.2 
	fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/monDom2/bed/tblastn.hg17KG
    para create blastSpec
    para push
# Completed: 323851 of 323851 jobs
# CPU time in finished jobs:  113015986s 1883599.76m 31393.33h 1308.06d  3.584 y
# IO & Wait Time:               2732896s   45548.27m   759.14h   31.63d  0.087 y
# Average job time:                 357s       5.96m     0.10h    0.00d
# Longest job:                     4911s      81.85m     1.36h    0.06d
# Submission to last job:        274144s    4569.07m    76.15h    3.17d

    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    ssh kki
    cd /cluster/data/monDom2/bed/tblastn.hg17KG
    para create chainSpec
    para push
# Completed: 649 of 649 jobs
# CPU time in finished jobs:       4726s      78.76m     1.31h    0.05d  0.000 y
# IO & Wait Time:                 76824s    1280.41m    21.34h    0.89d  0.002 y
# Average job time:                 126s       2.09m     0.03h    0.00d
# Longest job:                      312s       5.20m     0.09h    0.00d
# Submission to last job:          7487s     124.78m     2.08h    0.09d

    exit
    # back to eieio
    cd /cluster/data/monDom2/bed/tblastn.hg17KG/blastOut
    for i in kg??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 17,17n -k 17,17n  | uniq  > /cluster/data/monDom2/bed/tblastn.hg17KG/blastHg17KG.psl
    cd ..

    ssh hgwdev
    cd /cluster/data/monDom2/bed/tblastn.hg17KG
    hgLoadPsl monDom2 blastHg17KG.psl

    # back to eieio
    rm -rf blastOut
# End tblastn

##########################################################################
#	blastz self (NOT POSSIBLE - 2005-08-25 - Hiram)
#	This does not work because it requires over 25,000,000
#	kluster jobs.  This procedure would have to be done in a
#	different way.
#	We do not need the self blastz on this assembly, it is not useful.
    ssh pk
    mkdir /cluster/data/monDom2/bed/blastzSelf
    cd /cluster/data/monDom2/bed/blastzSelf

    cat << '_EOF_' > DEF
# monDom2 vs monDom2
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run.v7
BLASTZ=blastz.v7
#       Since we are running this without lineage specific repeats,
#       set BLASTZ_M to limit a single sequence matching 1000 times
BLASTZ_M=1000
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: MonDom2
SEQ1_DIR=/san/sanvol1/scratch/monDom2/maskedNibs
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: MonDom2
SEQ2_DIR=/san/sanvol1/scratch/monDom2/monDom2.2bit
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/monDom2/bed/blastzSelf

SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    #	happy emacs

    twoBitInfo /san/sanvol1/scratch/monDom2/monDom2.2bit stdout | \
	sort -rn +1  > S1.len
    cp -p S1.len S2.len

    #	establish a screen to control this job
    #	altered local copy of doBlastzChainNet.pl to fix santest location
    #	and run everything on pk since the san location is unique here
    screen
    #	to detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh pk
    screen -d -r
    time ./doBlastzChainNet.pl \
	-bigClusterHub=pk \
	-smallClusterHub=pk \
	-fileServer=kkstore01 \
	`pwd`/DEF > blast.run.out 2>&1 &
    #	Fails on the gensub2 trying to create the jobList file
    #	when it reaches some memory limit

##########################################################################
# BLASTZ Mouse mm7 (WORKING - 2006-01-24 - Hiram)
    ssh pk
    mkdir /cluster/data/mm7/bed/blastzMonDom2.2006-01-24
    cd /cluster/data/mm7/bed/blastzMonDom2.2006-01-24

    cat << '_EOF_' > DEF
# Mouse vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Mouse (mm7)
SEQ1_DIR=/scratch/hg/mm7/nib
SEQ1_LEN=/scratch/hg/mm7/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Opossum monDom2
SEQ2_DIR=/scratch/hg/monDom2/monDom2.2bit
SEQ2_LEN=/scratch/hg/monDom2/chrom.sizes
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzMonDom2.2006-01-24
TMPDIR=/scratch/tmp
'_EOF_'
    #	happy emacs

    cd /cluster/data/mm7/bed/blastzMonDom2.2006-01-24
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	`pwd`/DEF > blastz.out 2>&1 &
    #	Started 2006-01-24 13:55

##########################################################################
# BLASTZ Mouse mm6 (STARTED - 2005-08-25 - Hiram)(OBSOLETE - see above Mm7)
    #	copied mm6 nib files from /cluster/data/mm6/nib to
    #	/san/sanvol1/scratch/mm6/nib
    ssh pk
    mkdir /cluster/data/mm6/bed/blastzMonDom2.2005_08_25
    cd /cluster/data/mm6/bed/blastzMonDom2.2005_08_25
    cat << '_EOF_' > DEF
# Mm6 vs monDom2
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run.v7
BLASTZ=blastz.v7
# Set up blastz parameters using parameters between (chicken and fish ?)
# but not abridging repeats since can't do that with scaffolds, and
# it's not very relevant at this evolutionary distance.
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Mm6
SEQ1_DIR=/san/sanvol1/scratch/mm6/nib
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: MonDom2
SEQ2_DIR=/san/sanvol1/scratch/monDom2/monDom2.2bit
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/mm6/bed/blastzMonDom2.2005_08_25

SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << keep emacs coloring happy

    cp -p /cluster/data/mm6/chrom.sizes ./S1.len
    twoBitInfo /san/sanvol1/scratch/monDom2/monDom2.2bit stdout | \
	sort -rn +1  > S2.len

    #	establish a screen to control this job
    #	altered local copy of doBlastzChainNet.pl to fix santest location
    #	and run everything on pk since the san location is unique here
    screen
    #	to detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh pk
    screen -d -r
    time ./doBlastzChainNet.pl \
	-chainMinScore=5000 \
	-bigClusterHub=pk \
	-smallClusterHub=pk \
	-fileServer=kkstore01 \
	`pwd`/DEF > blast.run.out 2>&1 &
XXX - STARTED - 2005-08-25 16:32

############################################################################
#  BLATSERVERS ENTRY (DONE - 2005-08-31 - Hiram)
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("monDom2", "blat16", "17780", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("monDom2", "blat16", "17781", "0", "1");' \
	    hgcentraltest

############################################################################
# BLASTZ SWAP FOR ZEBRAFISH (danRer3) (DONE, 2005-10-19, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET, LIFTOVER AND ALIGNMENT DOWNLOADS

    # do swap of danRer3 vs. monDom2 chain and net alignments to 
    # create monDom2 vs. danRer3. see makeDanRer3.doc for details.
    ssh hgwdev
    cd /cluster/data/danRer3/bed/blastz.monDom2/chromsAndScafsRun
    mkdir -p /san/sanvol1/scratch/monDom2/monDom2vsdanRer3Out
    nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
       -bigClusterHub=pk \
       -blastzOutRoot /san/sanvol1/scratch/monDom2/monDom2vsdanRer3Out \
       -swap -chainMinScore=5000 >& doSwap.log &
    # Start: Tue Oct 18 21:36 Finished: Oct 18 22:22
# Parameters used for danRer3 vs monDom2 BLASTZ:
# BLASTZ_H=2000
# BLASTZ_Y=3400
# BLASTZ_L=10000
# BLASTZ_K=2200
# BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q
# BLASTZ_ABRIDGE_REPEATS=0
# Add entries for danRer3 chains and net to trackDb.ra for monDom2 and
# modify descriptions to describe the process using scaffolds for danRer3
# chrNA and chrUn.

