#########################################################################
# Erinaceus europaeus -- European Hedgehog
# Broad eriEur1 2X release

#########################################################################
# Download sequence (2007-03-04 kate)

    ssh kkstore05
    mkdir -p /cluster/store12/eriEur1
    ln -s /cluster/store12/eriEur1 /cluster/data/eriEur1
    cd /cluster/data/eriEur1
    mkdir downloads
    cd downloads
    wget -nd -r ftp://ftp.broad.mit.edu/pub/assemblies/mammals/hedgehog/eriEur1

#########################################################################
# Create .ra file and run makeGenomeDb.pl
    ssh hgwdev
    cd /cluster/data/eriEur1
    cat << '_EOF_' >eriEur1.config.ra
# Config parameters for makeGenomeDb.pl:
db eriEur1
clade mammal
genomeCladePriority 37
scientificName Erinaceus europaeus
commonName hedgehog
assemblyDate June 2006
assemblyLabel Broad Institute eriEur1 (Draft_v1)
orderKey 200
mitoAcc 5835792
fastaFiles /cluster/data/eriEur1/downloads/assembly.bases.gz
agpFiles /cluster/data/eriEur1/downloads/assembly.agp
qualFiles /cluster/data/eriEur1/downloads/scaffold.lifted.qac
dbDbSpeciesDir hedgehog
'_EOF_'

    makeGenomeDb.pl -verbose=2 -stop=seq eriEur1.config.ra > makeGenomeDb.out &

    # Need dbDb entry for name lookup
    hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
        defaultPos, active, orderKey, genome, scientificName, \
        htmlPath, hgNearOk, hgPbOk, sourceName) \
        VALUES("eriEur1", "June 2006", "/gbdb/eriEur1", "Hedgehog", \
        "", 0, 100, "Hedgehog", \
        "Erinaceus europaeus", "", 0, 0, \
        "Broad Institute eriEur1 (Draft_v1)")' -h localhost hgcentraltest

################################################
## WINDOWMASKER (2007-03-04 kate)
    ssh kkstore05
    cd /cluster/data/eriEur1/bed/
    nice /cluster/bin/scripts/doWindowMasker.pl \
        -workhorse=kolossus eriEur1 >& wmRun.log &

    ln -s WindowMasker.2007-03-05 WMRun
    mv wmRun.log WMRun
    cd WMRun

    # convert the lower-case n's left by WM to upper-case N
    # (request to Andy to fix this)
    twoBitToFa eriEur1.wmsk.sdust.2bit stdout | tr n N | \
            faToTwoBit stdin /cluster/data/eriEur1/eriEur1.2bit

    # stats on masking
    cd /cluster/data/eriEur1
    twoBitToFa eriEur1.2bit stdout | nice faSize stdin >& faSize.log &
#3367787358 bases (1234652522 N's 2133134836 real 1047679039 upper 1085455797 lower) in 379802 sequences in 1 files
#Total size: mean 8867.2 sd 18005.2 min 201 (scaffold_126826) max 555390 (scaffold_371434) median 2901

    # 50.9% of real bases masked (1085455797 lower / 2133134836 real)
    # Mouse and rat are ~43%, from RM.  TRF gives them an extra 2%, skip here

    mkdir -p /san/sanvol1/scratch/eriEur1
    cp -p eriEur1.2bit chrom.sizes /san/sanvol1/scratch/eriEur1

    #	load this table after creating a db (DONE - 2008-10-21 - Hiram)
    cd /hive/data/genomes/eriEur1/bed/WindowMasker.2007-03-05
    hgLoadBed eriEur1 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 10735601 elements of size 3
    #	real    3m25.230s

################################################
# DOWNLOADS (2007-06-05 kate)

    ssh kkstore05
    cd /cluster/data/eriEur1
    mkdir bigZips
    cd bigZips
    nice twoBitToFa ../eriEur1.2bit eriEur1.fa
    cp ../downloads/assembly.agp eriEur1.agp
    nice gzip eriEur1.fa eriEur1.agp
    md5sum *.gz > md5sum.txt

    ssh hgwdev
    set d = /usr/local/apache/htdocs/goldenPath
    set bd = /cluster/data/eriEur1
    cp $d/sorAra1/bigZips/README.txt $bd/bigZips
    # edit the copied README.txt: replace the sorAra1 details with eriEur1
    mkdir -p $d/eriEur1/bigZips
    ln -s $bd/bigZips/{*.gz,md5sum.txt,README.txt} $d/eriEur1/bigZips
   
##############################################################################
## Repeat Masker (DONE - 2008-10-21 - Hiram)
    screen	# to manage this several day job
    mkdir /hive/data/genomes/eriEur1/bed/repeatMasker
    cd /hive/data/genomes/eriEur1/bed/repeatMasker
    time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
	-workhorse=hgwdev -bigClusterHub=swarm \
	-buildDir=`pwd` eriEur1 > do.log 2>&1 &
    #	real    307m30.737s

    twoBitToFa eriEur1.rmsk.2bit stdout | faSize stdin > faSize.rmsk.txt
# 3367787358 bases (1234652522 N's 2133134836 real 1749034540 upper 384100296
# lower) in 379802 sequences in 1 files
# %11.41 masked total, %18.01 masked real

#########################################################################
# SIMPLE REPEATS TRF (DONE - 2008-10-21 - Hiram)
    screen # use a screen to manage this job
    mkdir /hive/data/genomes/eriEur1/bed/simpleRepeat
    cd /hive/data/genomes/eriEur1/bed/simpleRepeat
    # 
    time $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \
	-buildDir=/cluster/data/eriEur1/bed/simpleRepeat eriEur1 > do.log 2>&1 &
    #	real    74m17.044s
    #	failed during load due to no eriEur1 database present, add that
    #	(see below) and run the doLoad.csh manually:
    cd /hive/data/genomes/eriEur1/bed/simpleRepeat
    time ./doLoad.csh
    #	real    2m34.038s
    cat fb.simpleRepeat
    #	47016125 bases of 2133134836 (2.204%) in intersection

    #	after RM run is done, add this mask:
    cd /hive/data/genomes/eriEur1
    #	do NOT disturb the existing WindowMasked sequence
    twoBitMask eriEur1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed \
	eriEur1.rmTrf.2bit
    #	can safely ignore warning about >=13 fields in bed file

    twoBitToFa eriEur1.rmTrf.2bit stdout | faSize stdin \
	> eriEur1.rmTrf.faSize.txt
# 3367787358 bases (1234652522 N's 2133134836 real 1747981326 upper 385153510
# lower) in 379802 sequences in 1 files
# %11.44 masked total, %18.06 masked real

    #	link to gbdb
    #	do NOT replace the existing WindowMasked sequence
    # ln -s `pwd`/eriEur1.2bit /gbdb/eriEur1

#########################################################################
# creating DB to make it easier to work with this (DONE - 2008-10-20 - Hiram)
    cd /hive/data/genomes/eriEur1/downloads
    qaToQac assembly.quals.gz stdout \
    | qacAgpLift assembly.agp stdin eriEur1.quals.qac
    cd /hive/data/genomes/eriEur1
    # edit eriEur1.config.ra to set:
    #	qualFiles /cluster/data/eriEur1/downloads/eriEur1.quals.qac
    makeGenomeDb.pl -workhorse=hgwdev -continue=agp -stop=agp \
	eriEur1.config.ra > makeAgp.log 2>&1
    time makeGenomeDb.pl -workhorse=hgwdev -continue=db -stop=db \
	eriEur1.config.ra > makeDb.log 2>&1
    #	real    32m3.304s
    makeGenomeDb.pl -workhorse=hgwdev -continue=dbDb -stop=dbDb \
	eriEur1.config.ra > makeDbDb.log 2>&1

#########################################################################
# prepare for kluster runs (DONE - 2008-10-21 - Hiram)
    # compare to size of real bases to adjust the repMatch
    #	hg18: 2881421696
    #	eriEur1: 2133134836
    # thus: 1024 * 2133134836/2881421696 = 758
    #	rounding up to 800 for a bit more conservative masking
    cd /hive/data/genomes/eriEur1
    time blat eriEur1.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=eriEur1.11.ooc -repMatch=800
    #	Wrote 43503 overused 11-mers to eriEur1.11.ooc
    #	real    2m23.915s

    #	and staging data for push to kluster nodes
    mkdir /hive/data/staging/data/eriEur1
    cp -p eriEur1.2bit chrom.sizes eriEur1.11.ooc \
	/hive/data/staging/data/eriEur1
    #	request to cluster admin to push this to the kluster nodes
    #	/scratch/data/

###########################################################################
# add NCBI identifiers to the dbDb (DONE - 2008-11-17 - Hiram)
    hgsql -e 'update dbDb set
sourceName="Broad Institute eriEur1 (Draft_v1, NCBI project 12575)" where name="eriEur1";' hgcentraltest
    hgsql -e 'update dbDb set
organism="Hedgehog" where name="eriEur1";' hgcentraltest

############################################################################
#  eriEur1 - Hedgehog - Ensembl Genes version 51  (DONE - 2008-12-04 - hiram)
    ssh kkr14u08
    cd /hive/data/genomes/eriEur1
    cat << '_EOF_' > eriEur1.ensGene.ra
# required db variable
db eriEur1
# do we need to translate geneScaffold coordinates
geneScaffolds yes
# ignore genes that do not properly convert to a gene pred, and contig
#       names that are not in the UCSC assembly
skipInvalid yes
# ignore the three genes that have invalid structures from Ensembl:
# 4691: ENSEEUT00000004188 no exonFrame on CDS exon 7
# 35795: ENSEEUT00000003156 no exonFrame on CDS exon 4
# 40908: ENSEEUT00000001064 no exonFrame on CDS exon 2
'_EOF_'
#  << happy emacs

    doEnsGeneUpdate.pl -ensVersion=51 eriEur1.ensGene.ra
    ssh hgwdev
    cd /hive/data/genomes/eriEur1/bed/ensGene.51
    featureBits eriEur1 ensGene
    #	21932124 bases of 2133134836 (1.028%) in intersection

    #	*** All done!  (through the 'makeDoc' step)
    #	*** Steps were performed in /hive/data/genomes/eriEur1/bed/ensGene.51

############################################################################
