# Bos taurus -- University of Maryland UMD Release 3.0 (Aug 25 2009)
#
#	"$Id: bosTauMd3.txt,v 1.4 2009/11/30 19:10:04 galt Exp $"
#
# Creating a minimal build in order to make liftOver files between MD3
# (bosTauMd3) and Btau4 (bosTau4).
###########################################################################

# Set up the main genome directory.
# All later steps in this log work under /hive/data/genomes/bosTauMd3,
# starting from a login on hgwdev.

ssh hgwdev
cd /hive/data/genomes
mkdir bosTauMd3
cd bosTauMd3

# DOWNLOAD SEQUENCE (DONE - 2009-11-18 - Galt)
# Fetches the UMD 3.0 AGP and fasta into download/ and normalizes the
# chromosome names in the AGP.

mkdir download
cd download

# Get the sequence files from UMD.
# --timestamping skips the transfer when the local copy is already current;
# -nd stores the file in the current directory without recreating the
# remote directory hierarchy.

wget --timestamping -nd 'ftp://ftp.cbcb.umd.edu/pub/data/Bos_taurus/Bos_taurus_UMD_3.0/bos_taurus.agp'
wget --timestamping -nd 'ftp://ftp.cbcb.umd.edu/pub/data/Bos_taurus/Bos_taurus_UMD_3.0/bos_taurus.fa.gz'

# Fix up ids in the AGP:
#   rewrite a "Chr" at the start of a line to "chr"
#   (AGP lines carry no ">" prefix; the sed only touches the first "Chr"
#   anchored at the beginning of each line).
# The untouched download is kept as bos_taurus.orig.agp.
# No command in this section edits the fasta file.
mv bos_taurus.agp bos_taurus.orig.agp
cat bos_taurus.orig.agp | sed 's/\(^Chr\)/chr/' > bos_taurus.agp

# Back to the main genome directory.
cd /hive/data/genomes/bosTauMd3

# Run automation to make the basic genome.
#
# The here-document below writes bosTauMd3.config.ra, which is then passed
# to makeGenomeDb.pl.  These commands use csh/tcsh syntax: the here-document
# is closed by the literal line '_EOF_' (quotes included), and "> &" sends
# both stdout and stderr to the log file.
#
# NOTE(review): scientificName is written "Bos Taurus" in the config; the
# usual binomial form is "Bos taurus" -- confirm whether the stored value
# should be corrected.

    cat << '_EOF_' > bosTauMd3.config.ra
# Config parameters for makeGenomeDb.pl:
db bosTauMd3
clade mammal
scientificName Bos Taurus
assemblyDate Aug. 2009
assemblyLabel Univ. Maryland Release 3.0
orderKey 236
dbDbSpeciesDir cow
mitoAcc 60101824
commonName Cow
taxId 9913
fastaFiles /hive/data/genomes/bosTauMd3/download/bos_taurus.fa.gz
agpFiles /hive/data/genomes/bosTauMd3/download/bos_taurus.agp
subsetLittleIds Y
'_EOF_'
    # << happy emacs (keeps editor syntax highlighting balanced)


# Runs in the background; all output goes to makeGenomeDb.pl.out.
time makeGenomeDb.pl bosTauMd3.config.ra > & makeGenomeDb.pl.out &
# took 11 minutes

# Note: I added the subsetLittleIds option to config.ra
# because I had checked the .agp, which was OK,
# but the script complained that there were some extra
# sequences in the fasta file.  This option allows
# the system to ignore those, as long as everything
# in column 6 of the agp is found in the fasta file(s).

# Sanity check 1: gap coverage, counted against the whole genome size.
featureBits -countGaps bosTauMd3 gap
#20737263 bases of 2660922743 (0.779%) in intersection

# Sanity check 2: running total of column 2 (size) of chrom.sizes, printed
# in front of each input line; the value on the last line is the genome
# total.
cat chrom.sizes | awk '{sum+=$2;print sum,$0}'
#2660922743
# Same total as the genome size reported by featureBits above.

# TODO
# Organism Image
# NOTE(review): the command below fetches and saves an *Aplysia californica*
# image (both the URL and the -O target name say Aplysia_californica.jpg).
# This is a cow (Bos taurus) build, so it looks like a placeholder copied
# from another make doc -- replace the URL and the output file name with a
# cow image before running.
wget -O /usr/local/apache/htdocs/images/Aplysia_californica.jpg \
 'http://upload.wikimedia.org/wikipedia/commons/thumb/e/ef/Aplysia_californica.jpg/250px-Aplysia_californica.jpg'

# TODO
# Edit and check in the templates for description.html, gold.html, gap.html,
# and bosTauMd3/trackDb.ra.

# Repeat masking (RepeatMasker).
# The build directory is created as repeatMasker/ relative to the current
# directory (the main genome directory, if nothing has changed it since the
# last cd).
# For next time, put this under the bed/ dir instead.
mkdir repeatMasker
cd repeatMasker
# Runs in the background; "> &" (csh/tcsh) logs stdout and stderr together.
time doRepeatMasker.pl bosTauMd3 -buildDir=`pwd` > & doRepeatMasker.pl.out &


# Check the masking summary once the run has finished.
cat faSize.rmsk.txt 
#%47.84 masked total, %48.22 masked real

# Cross-check the rmsk coverage against the whole genome size.
featureBits -countGaps bosTauMd3 rmsk
#1273525727 bases of 2660922743 (47.860%) in intersection

# Simple repeats (TRF).
# Unlike the RepeatMasker run, this build directory lives under bed/.
cd /hive/data/genomes/bosTauMd3/bed
mkdir simpleRepeat
cd simpleRepeat
# Runs in the background; stdout and stderr go to doSimpleRepeat.pl.out.
time doSimpleRepeat.pl bosTauMd3 -buildDir=`pwd` > & doSimpleRepeat.pl.out &


# The cluster run failed only on chrM, which doesn't matter.
# Perhaps someone will look into why bigTrf fails on chrM.
# Record the cluster timing from the run.cluster directory on swarm.
ssh swarm
cd /hive/data/genomes/bosTauMd3/bed/simpleRepeat/run.cluster
/parasol/bin/para time > run.time

# Resume the pipeline at the "filter" step, appending (">> &") to the same
# log file.  `pwd` is evaluated in the directory this command is run from,
# so -buildDir is whatever directory is current at that point.
time doSimpleRepeat.pl -continue filter -buildDir `pwd` bosTauMd3 \
  >> & doSimpleRepeat.pl.out &


# Check simpleRepeat coverage against the whole genome size.
featureBits -countGaps bosTauMd3 simpleRepeat
#29450984 bases of 2660922743 (1.107%) in intersection

# Make the final masked .2bit:
# add the TRF mask (bed/simpleRepeat/trfMask.bed) on top of the
# RepeatMasker-masked bosTauMd3.rmsk.2bit and write bosTauMd3.2bit.
cd /hive/data/genomes/bosTauMd3
twoBitMask bosTauMd3.rmsk.2bit -add bed/simpleRepeat/trfMask.bed bosTauMd3.2bit
# Expected warning printed by the command above:
#   Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which
#   means it might contain block coordinates, but this program uses only
#   the first three fields (the entire span -- no support for blocks).
# This seems to be generic output we always see.

############################################################################
# PREPARE CLUSTER DATA (DONE - 2009-11-20 - Galt)
# Publishes the masked .2bit under /gbdb, builds the 11.ooc file with blat,
# and stages the files that need to be synced to the cluster nodes.

ssh hgwdev
cd /hive/data/genomes/bosTauMd3

# Create the gbdb symlink to the masked .2bit.
ln -s `pwd`/bosTauMd3.2bit /gbdb/bosTauMd3/

# Build 11.ooc via blat's -makeOoc option (tileSize 11, repMatch 1024);
# /dev/null stands in for the query and output arguments.
time blat bosTauMd3.2bit /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024
#Wrote 27298 overused 11-mers to 11.ooc
#151.299u 3.434s 3:06.07 83.1%   0+0k 0+0io 3pf+0w

# Stage the .2bit, the ooc file and chrom.sizes; cp -p preserves
# timestamps and modes.
mkdir /hive/data/staging/data/bosTauMd3
cp -p bosTauMd3.2bit /hive/data/staging/data/bosTauMd3
cp -p 11.ooc /hive/data/staging/data/bosTauMd3
cp -p chrom.sizes /hive/data/staging/data/bosTauMd3

# Ask an admin to sync this directory: /hive/data/staging/data/bosTauMd3/
# to the kluster nodes at /scratch/data/bosTauMd3/

#############################################################################
# LIFTOVER TO bosTau4 (DONE - 2009-11-23 - Galt )
# Runs doSameSpeciesLiftOver.pl from bosTauMd3 to bosTau4 in a dated build
# directory, at low priority ("nice +19" is the csh/tcsh form), with all
# output captured in do.log.
    mkdir /hive/data/genomes/bosTauMd3/bed/blat.bosTau4.2009-11-23
    cd /hive/data/genomes/bosTauMd3/bed/blat.bosTau4.2009-11-23
    time nice +19 doSameSpeciesLiftOver.pl -verbose=2 \
	-bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \
	 bosTauMd3 bosTau4 >& do.log
    # The run actually ran out of space on /scratch/tmp (it used over 16GB
    # temporarily), so I moved the temp dir somewhere else and finished
    # the net step.  Then I did the load and cleanup steps.
    # (The exact commands for that manual recovery are not recorded here.)


###################################################

# Change bosTauMd3 in hgcentraltest.dbDb to active=0 and set its description.
# The "update" lines below are SQL statements, not shell commands;
# presumably they are typed at the prompt opened by hgsql.

hgsql hgcentraltest

update dbDb set active=0 where name='bosTauMd3';
# Changed because liftover is confusing otherwise,
# when people are expecting successive versions
# of the same assembly rather than alternate assemblies.
update dbDb set description='Aug. 2009 (UMD3)' where name='bosTauMd3';
