# for emacs: -*- mode: sh; -*-

# Drosophila pseudoobscura -- Baylor "CAF1" via Eisen's 12-fly site
# (Identical to FlyBase dpse_r2.1_FB2008_02)

# THIS IS ONLY TO GET MASKED SEQUENCE -- NOT A BROWSER AT THIS POINT


#########################################################################
# DOWNLOAD SEQUENCE (DONE 9/26/06 angie)
# Fetch the CAF1 tarball into /cluster/data/dp4/downloads, record its size
# summary, and write UCSC-named copies of the FASTA and AGP files.
    ssh kkstore05
    # The data lives on store12; /cluster/data/dp4 is a symlink to it.
    mkdir /cluster/store12/dp4
    ln -s /cluster/store12/dp4 /cluster/data/dp4
    mkdir /cluster/data/dp4/downloads
    cd /cluster/data/dp4/downloads
    wget http://rana.lbl.gov/drosophila/caf1/dpse_caf1.tar.gz
    tar xvzf dpse_caf1.tar.gz
    cd dpse
    # Size summary of the downloaded scaffolds (output recorded below):
    faSize scaffolds.fa
#152738921 bases (6681752 N's 146057169 real 146057169 upper 0 lower) in 4896 sequences in 1 files
#Total size: mean 31196.7 sd 649312.0 min 101 (Unknown_singleton_1691) max 30794189 (Ch2) median 1734
#N count: mean 1364.7 sd 21011.7
#U count: mean 29831.9 sd 628747.5
#L count: mean 0.0 sd 0.0
    # Tweak their funny chromosome names (Ch*) to our pattern (chr*).
    # Both patterns are anchored with ^, so only names at the start of a
    # FASTA header line (>Ch...) or at the start of an AGP line are renamed.
    # The originals are left untouched; the renamed copies are UCSC.fa/UCSC.agp.
    sed -e 's/^>Ch/>chr/' scaffolds.fa > UCSC.fa
    sed -e 's/^Ch/chr/' assembly.agp > UCSC.agp


#########################################################################
# MAKE GENOME DB *UP TO DB STEP ONLY* (DONE 9/27/06 angie)
# Write the makeGenomeDb.pl config file and run the script through its
# "db" step only.
    ssh kkstore05
    cd /cluster/data/dp4
    # Everything between the cat line and EOF is written verbatim into
    # dp4.config.ra (the leading "# Config parameters" line is file content,
    # not a shell comment).  fastaFiles/agpFiles point at the renamed
    # UCSC.fa / UCSC.agp under downloads/dpse.
    cat > dp4.config.ra <<EOF
# Config parameters for makeGenomeDb.pl:
db dp4
clade insect
scientificName Drosophila pseudoobscura
assemblyDate Feb. 2006
assemblyLabel Baylor CAF1
orderKey 57
mitoAcc none
fastaFiles /cluster/data/dp4/downloads/dpse/UCSC.fa
agpFiles /cluster/data/dp4/downloads/dpse/UCSC.agp
dbDbSpeciesDir drosophila
EOF

    # Stop at db step so we can use featureBits, but don't do dbDb and trackDb
    # because we're not building an actual browser for now.
    # The run is backgrounded with stdout+stderr sent to makeGenomeDb.log
    # (>&), and the log is followed with tail -f.
    makeGenomeDb.pl dp4.config.ra -stop=db \
      >& makeGenomeDb.log & tail -f makeGenomeDb.log
    # Because of the extremely long sequence names for unplaced sequences,
    # the chromInfo load command failed: the index string length was too
    # short.  So I temporarily modified my ~/kent/src/hg/lib/chromInfo.sql
    # to lengthen the index, dropped the dp4 database and ran -continue db.


#########################################################################
# REPEATMASKER (DONE 9/27/06 angie)
# Run doRepeatMasker.pl on dp4 and compare chr2 rmsk coverage with dp3.
    ssh kkstore05
    # Run -debug to create the dir structure and preview the scripts:
    # NOTE(review): this preview omits -species drosophila, which the real
    # run below passes -- confirm the previewed scripts were not the ones used.
    doRepeatMasker.pl dp4 -verbose 3 -debug
    # Run it for real and tail the log:
    # (the log goes into the dated RepeatMasker directory, which has to
    # exist already -- see the -debug run above)
    doRepeatMasker.pl dp4 -species drosophila -verbose 3 \
      >& /cluster/data/dp4/bed/RepeatMasker.2006-09-27/do.log &
    tail -f /cluster/data/dp4/bed/RepeatMasker.2006-09-27/do.log
    # RepeatMasker and lib version from do.log:
#    March 20 2006 (open-3-1-5) version of RepeatMasker
#CC   RELEASE 20060315;                                            *
    # Compare coverage to previous assembly:
    # (chr2 only; each featureBits output line is recorded under its command)
    featureBits -chrom=chr2 dp4 rmsk
#1245499 bases of 29873196 (4.169%) in intersection
    featureBits -chrom=chr2 dp3 rmsk
#1140300 bases of 29702756 (3.839%) in intersection


#########################################################################
# SIMPLE REPEATS (TRF) (DONE 9/27/06 angie)
# Run trfBig over the unmasked sequence, derive a filtered BED for masking,
# load the unfiltered BED as the simpleRepeat table, and compare with dp3.
    ssh kolossus
    # Work in a tcsh subshell started under nice; the >& redirect below is
    # written for that shell.
    nice tcsh
    mkdir /cluster/data/dp4/bed/simpleRepeat
    cd /cluster/data/dp4/bed/simpleRepeat
    # Stream the unmasked 2bit as FASTA into trfBig.  trfBig's regular output
    # argument is /dev/null; the results kept are the BED written via -bedAt.
    # The pipeline is backgrounded, logging to trf.log, which is then tailed.
    twoBitToFa ../../dp4.unmasked.2bit stdout \
    | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
      -bedAt=simpleRepeat.bed -tempDir=/tmp \
    >& trf.log & tail -f trf.log
    # ~50 minutes (longer than D. mel, must be because of the scaffolds)

    # Make a filtered version for sequence masking:
    # keeps only rows whose 5th field is <= 12 (presumably the repeat period
    # column -- confirm against simpleRepeat.sql).
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed

    # Load unfiltered repeats into the database:
    ssh hgwdev
    hgLoadBed dp4 simpleRepeat \
      /cluster/data/dp4/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    # Compare coverage to previous assembly:
    # (chr2 only; each featureBits output line is recorded under its command)
    featureBits -chrom=chr2 dp4 simpleRepeat
#530609 bases of 29873196 (1.776%) in intersection
    featureBits -chrom=chr2 dp3 simpleRepeat
#517077 bases of 29702756 (1.741%) in intersection


#########################################################################
# MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE 9/27/06 angie)
# Input:  dp4.rmsk.2bit plus bed/simpleRepeat/trfMask.bed (the filtered TRF
#         BED), combined with -add.
# Output: dp4.2bit, the file the later BLASTZ DEF files and blat refer to.
    ssh kolossus
    cd /cluster/data/dp4
    time twoBitMask dp4.rmsk.2bit -add bed/simpleRepeat/trfMask.bed dp4.2bit
    # This warning is OK -- the extra fields are not block coordinates.
#Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks).
#0.201u 0.367s 0:01.78 31.4%     0+0k 0+0io 1pf+0w

    # Because this is a no-browser build (just masking for alignment)
    # I did not make the usual /gbdb/$db/$db.2bit link.


###########################################################################
# BLASTZ/CHAIN/NET DROANA3 (DONE 10/3/06 angie)
# dp4 is the target (SEQ1) and droAna3 the query (SEQ2).
    ssh kkstore05
    mkdir /cluster/data/dp4/bed/blastz.droAna3.2006-10-02
    cd /cluster/data/dp4/bed/blastz.droAna3.2006-10-02
    # Write the DEF parameter file.  The delimiter is quoted ('_EOF_'), so the
    # body is copied literally; its "#" lines are DEF file content.
    # NOTE(review): DEF reads both genomes from /iscratch/i/...; the step that
    # placed dp4.2bit there is not shown in this doc.
    cat << '_EOF_' > DEF
# D. pseudoobscura vs. D. ananassae

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. pseudoobscura
SEQ1_DIR=/iscratch/i/dp4/dp4.2bit
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dp4/chrom.sizes

# QUERY - D. ananassae
SEQ2_DIR=/iscratch/i/droAna3/droAna3.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droAna3/chrom.sizes

BASE=/cluster/data/dp4/bed/blastz.droAna3.2006-10-02
'_EOF_'
    # << this line keeps emacs coloring happy
    # Run in the background with all output in do.log, then follow the log.
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /panasas/store/dp4droAna3 >& do.log &
    tail -f do.log
    # Undated alias for this run.  The link target is relative, so it
    # resolves inside /cluster/data/dp4/bed.
    ln -s blastz.droAna3.2006-10-02 /cluster/data/dp4/bed/blastz.droAna3


###########################################################################
# BLASTZ/CHAIN/NET DROWIL1 (DONE 10/3/06 angie)
# dp4 is the target (SEQ1) and droWil1 the query (SEQ2).  The BLASTZ_* and
# SEQ1_* settings are the same values as in the droAna3 DEF in this doc.
    ssh kkstore05
    mkdir /cluster/data/dp4/bed/blastz.droWil1.2006-10-02
    cd /cluster/data/dp4/bed/blastz.droWil1.2006-10-02
    # Write the DEF parameter file.  The delimiter is quoted ('_EOF_'), so the
    # body is copied literally; its "#" lines are DEF file content.
    # NOTE(review): DEF reads both genomes from /iscratch/i/...; the step that
    # placed dp4.2bit there is not shown in this doc.
    cat << '_EOF_' > DEF
# D. pseudoobscura vs. D. willistoni

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. pseudoobscura
SEQ1_DIR=/iscratch/i/dp4/dp4.2bit
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dp4/chrom.sizes

# QUERY - D. willistoni
SEQ2_DIR=/iscratch/i/droWil1/droWil1.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droWil1/chrom.sizes

BASE=/cluster/data/dp4/bed/blastz.droWil1.2006-10-02
'_EOF_'
    # << this line keeps emacs coloring happy
    # Run in the background with all output in do.log, then follow the log.
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /panasas/store/dp4droWil1 >& do.log &
    tail -f do.log
    # Undated alias for this run.  The link target is relative, so it
    # resolves inside /cluster/data/dp4/bed.
    ln -s blastz.droWil1.2006-10-02 /cluster/data/dp4/bed/blastz.droWil1


#########################################################################
# SWAP DM3 CHAIN/NET (DONE 6/30/08 angie)
# No new DEF is written here: the -swap run is pointed at the DEF from the
# dm3 build's blastz.dp4.2006-12-04 directory.
    ssh kkstore05
    mkdir /cluster/data/dp4/bed/blastz.dm3.swap
    cd /cluster/data/dp4/bed/blastz.dm3.swap
    # Run in the background with all output in do.log, then follow the log.
    doBlastzChainNet.pl -swap \
      /cluster/data/dm3/bed/blastz.dp4.2006-12-04/DEF >& do.log &
    tail -f do.log
    # Short alias; the relative target resolves inside /cluster/data/dp4/bed.
    ln -s blastz.dm3.swap /cluster/data/dp4/bed/blastz.dm3


#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE 9/18/08 angie)
    # Use -repMatch=85, raised from dp3's 75: with 75 this assembly loses
    # 9272 tiles, as opposed to dp3's 6790.  Even if dp4 has more repetitive
    # reads than dp3, that seems like too big an increase (it would lose
    # sensitivity).  With 85 the count is 7218 (output recorded below).
    # Note that this step uses /hive/data/genomes/dp4 paths, whereas the
    # 2006 steps in this doc use /cluster/data/dp4.
    ssh kolossus
    # Both the query and the output arguments are /dev/null; the only thing
    # produced is the -makeOoc file.
    blat /hive/data/genomes/dp4/dp4.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=/hive/data/genomes/dp4/11.ooc -repMatch=85
#Wrote 7218 overused 11-mers to /hive/data/genomes/dp4/11.ooc


#########################################################################
# LIFTOVER TO DP3 (DONE 9/18/08)
# Uses the 11.ooc file at /hive/data/genomes/dp4/11.ooc.
    # First pass with -debug; it reported the working directory recorded
    # on the next comment line.
    doSameSpeciesLiftOver.pl -bigClusterHub=pk -workhorse=kolossus \
      -ooc=/hive/data/genomes/dp4/11.ooc dp4 dp3 -debug
# *** Steps were performed in /cluster/data/dp4/bed/blat.dp3.2008-09-18
    cd /cluster/data/dp4/bed/blat.dp3.2008-09-18
#NOTE FOR NEXT TIME: save log file!  (oops)
    # The real run below has no output redirection, so no do.log was kept.
    # Other runs in this doc append ">& do.log &" and then tail the log.
    doSameSpeciesLiftOver.pl -bigClusterHub=pk -workhorse=kolossus \
      -ooc=/hive/data/genomes/dp4/11.ooc dp4 dp3


#########################################################################
