# for emacs: -*- mode: sh; -*-

# Tursiops truncatus  Bottle-nosed dolphin
#########################################################################
# DOWNLOAD SEQUENCE (DONE braney 2008-10-07)
    # Create the assembly directory on store12 and symlink it into
    # /cluster/data, so the rest of this file can use /cluster/data/turTru1.
    ssh kkstore05
    mkdir /cluster/store12/turTru1
    ln -s /cluster/store12/turTru1 /cluster/data
    mkdir /cluster/data/turTru1/broad
    cd /cluster/data/turTru1/broad

    # Fetch the Turtru1.0 AGP, bases and quality files from the Broad FTP
    # site, then record checksums of everything matching ass*.
    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/bottlenosedDolphin/Turtru1.0/assembly.agp \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/bottlenosedDolphin/Turtru1.0/assembly.bases.gz \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/bottlenosedDolphin/Turtru1.0/assembly.quals.gz 
    md5sum ass* > assembly.md5sum

    # Build turTru1.qual.qac from the quality file and the AGP.
    # NOTE(review): presumably this lifts contig-level quality scores to
    # scaffold coordinates -- confirm against the qacAgpLift usage message.
    qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin turTru1.qual.qac

   # Count the runs of identical names in column 1 of the AGP; this equals
   # the number of scaffolds as long as each scaffold's rows are contiguous.
   cut -f 1 assembly.agp | uniq -c | wc -l 
   # Number of scaffolds: 116467


#########################################################################
# Create .ra file and run makeGenomeDb.pl (DONE braney 2008-10-07)
    ssh kolossus
    cd /cluster/data/turTru1
# Write the makeGenomeDb.pl configuration.  The here-document body below is
# the literal content of turTru1.config.ra, including its '#' lines.
cat << _EOF_ >turTru1.config.ra
# Config parameters for makeGenomeDb.pl:
db turTru1
clade mammal
genomeCladePriority 35
scientificName  Tursiops truncatus
commonName Dolphin
assemblyDate Feb. 2008
assemblyLabel Broad Institute turTru1 
orderKey 236.5
#mitoAcc AJ222767
mitoAcc none
fastaFiles /cluster/data/turTru1/broad/assembly.bases.gz
agpFiles /cluster/data/turTru1/broad/assembly.agp
qualFiles /cluster/data/turTru1/broad/turTru1.qual.qac
dbDbSpeciesDir dolphin
_EOF_

# use 'screen'; make sure you are on kkstore05
# NOTE(review): the ssh above goes to kolossus, and -workhorse is also
# kolossus, while this note names kkstore05 -- confirm the intended host.
    makeGenomeDb.pl -workhorse kolossus -verbose=2 turTru1.config.ra > makeGenomeDb.out 2>&1 &

    # Summarize column 2 (sequence sizes) of chrom.sizes; the recorded
    # output follows.
    cut -f 2 chrom.sizes | ave stdin
# Q1 1243.000000
# median 1593.000000
# Q3 6511.000000
# average 21628.860415
# min 336.000000
# max 1406907.000000
# count 116467
# total 2519048486.000000
# standard deviation 63795.083568

#########################################################################
## Repeat Masker (DONE - 2008-10-16 - Hiram)
    # Run doRepeatMasker.pl for turTru1 in the background, using the
    # current directory as the build directory; all output goes to do.log.
    screen	# to manage this several day job
    mkdir /hive/data/genomes/turTru1/bed/repeatMasker
    cd /hive/data/genomes/turTru1/bed/repeatMasker
    time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
	-workhorse=hgwdev -bigClusterHub=swarm \
	-buildDir=`pwd` turTru1 > do.log 2>&1 &
    #	real    797m7.301s
    # Measure the masked 2bit; the faSize report saved to faSize.rmsk.txt
    # is recorded below.
    twoBitToFa turTru1.rmsk.2bit stdout | faSize stdin > faSize.rmsk.txt
# 2519048486 bases (220604396 N's 2298444090 real 1290940163 upper 1007503927
# lower) in 116467 sequences in 1 files
# %40.00 masked total, %43.83 masked real

#########################################################################
# SIMPLE REPEATS TRF (DONE - 2008-10-15 - Hiram)
    screen # use a screen to manage this job
    mkdir /hive/data/genomes/turTru1/bed/simpleRepeat
    cd /hive/data/genomes/turTru1/bed/simpleRepeat
    # NOTE(review): -buildDir below uses the /cluster/data path while the
    # cd above uses /hive/data/genomes -- presumably the same directory
    # reached through a symlink; confirm.
    time $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \
	-buildDir=/cluster/data/turTru1/bed/simpleRepeat turTru1 > do.log 2>&1 &
    #	real    74m17.044s
    cat fb.simpleRepeat
    #	36542185 bases of 2298444090 (1.590%) in intersection

    #	after RM run is done, add this mask:
    #	the existing turTru1.2bit is removed and rebuilt from the
    #	RepeatMasker-masked 2bit plus the trfMask.bed regions
    cd /hive/data/genomes/turTru1
    rm turTru1.2bit
    twoBitMask turTru1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed turTru1.2bit
    #	can safely ignore warning about >=13 fields in bed file

    # Measure the final masked 2bit; the faSize report is recorded below.
    twoBitToFa turTru1.2bit stdout | faSize stdin > turTru1.2bit.faSize.txt
# 2519048486 bases (220604396 N's 2298444090 real 1290364660 upper 1008079430
# lower) in 116467 sequences in 1 files
# %40.02 masked total, %43.86 masked real
    #	link to gbdb
    ln -s `pwd`/turTru1.2bit /gbdb/turTru1

###########################################################################
# prepare for kluster runs (DONE - 2008-10-16 - Hiram)
    # compare to size of real bases to adjust the repMatch
    #	hg18: 2881421696
    #	turTru1: 2298444090
    # thus: 1024 * 2298444090/2881421696 = 816 (fraction dropped)
    #	rounding up to 850 for a more conservative masking
    cd /hive/data/genomes/turTru1
    # No alignment is done here: both sequence arguments are /dev/null and
    # the run only writes the 11-mer .ooc file named by -makeOoc.
    time blat turTru1.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=turTru1.11.ooc -repMatch=850
    #	Wrote 31521 overused 11-mers to turTru1.11.ooc
    #	real    2m9.494s

    #	and staging data for push to kluster nodes
    mkdir /hive/data/staging/data/turTru1
    cp -p turTru1.2bit chrom.sizes turTru1.11.ooc \
	/hive/data/staging/data/turTru1
    #	request to cluster admin to push this to the kluster nodes
    #	/scratch/data/

###########################################################################
# add NCBI identifiers to the dbDb (DONE - 2008-10-21 - Hiram)
    # Sets the sourceName column of the turTru1 row in the dbDb table of
    # the hgcentraltest database.
    hgsql -e 'update dbDb set
sourceName="Broad Institute turTru1 (NCBI project 20367, ABRN01000000)" where name="turTru1";' hgcentraltest

############################################################################
#  turTru1 - Dolphin - Ensembl Genes version 51  (DONE - 2008-12-04 - hiram)
    ssh swarm
    cd /hive/data/genomes/turTru1
    # Write the doEnsGeneUpdate.pl configuration.  The here-document body
    # is the literal content of turTru1.ensGene.ra, including its '#' lines.
    # NOTE(review): the closing delimiter below is written with its quotes
    # ('_EOF_'), which is the csh/tcsh form; under sh/bash the terminator
    # line must be the bare word _EOF_ -- confirm which shell is used before
    # pasting this section.
    cat << '_EOF_' > turTru1.ensGene.ra
# required db variable
db turTru1
# do we need to translate geneScaffold coordinates
geneScaffolds yes
# ignore genes that do not properly convert to a gene pred, and contig
#	names that are not in the UCSC assembly
skipInvalid yes
# ignore the 14,543 genes (of 26,340) that do not map to UCSC coordinates
'_EOF_'
#  << happy emacs

    doEnsGeneUpdate.pl -ensVersion=51 turTru1.ensGene.ra
    #	This did not pass the check during load of coverage of
    #	peptides vs. gene counts
    ssh hgwdev
    cd /hive/data/genomes/turTru1/bed/ensGene.51
    # Report how many bases of turTru1 the ensGene track covers.
    featureBits turTru1 ensGene
    #	10421083 bases of 2298444090 (0.453%) in intersection

    # Recorded completion message (presumably from doEnsGeneUpdate.pl),
    # kept as comments so it is not parsed as shell commands:
    # *** All done!  (through the 'makeDoc' step)
    # *** Steps were performed in /hive/data/genomes/turTru1/bed/ensGene.51

############################################################################
