# for emacs: -*- mode: sh; -*-

# Vicugna vicugna

#########################################################################
# DOWNLOAD SEQUENCE
#   Create the vicVic1 build directory, fetch the Broad alpaca assembly
#   files (AGP, bases, quality scores) and the BasicStats report, and
#   build the quality file vicVic1.qual.qac.
    ssh kkstore05
    mkdir /cluster/store12/vicVic1
    # The link destination is given as /cluster/data; assuming that is an
    # existing directory, ln creates the link inside it as
    # /cluster/data/vicVic1, which is the path used from here on.
    ln -s /cluster/store12/vicVic1 /cluster/data
    mkdir /cluster/data/vicVic1/broad
    cd /cluster/data/vicVic1/broad

    # --timestamping: skip files that are not newer than the local copy.
    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/alpaca/assembly.agp \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/alpaca/assembly.bases.gz \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/alpaca/assembly.quals.gz 


# Pipe the output of qaToQac into qacAgpLift together with the AGP file,
# writing vicVic1.qual.qac (listed as qualFiles in vicVic1.config.ra).
qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin vicVic1.qual.qac

    # Fetch the assembly statistics report; its contents are pasted below.
    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/alpaca/BasicStats.out


# Contents of BasicStats.out:
# --------------------------------------------------------------------------------
# Sun May 25 15:19:02 2008 run (pid=20827) using Mon May 19 16:28:59 EDT 2008 make
# BasicStats PRE=/wga/dev/WGAdata DATA=projects/LowCoverage/Vicugna \
#            RUN=run/work SUBDIR=assisted_2.5 QUAL_STATS=True \
#            OUTFILE=BasicStats.out
# --------------------------------------------------------------------------------
#
# Supercontigs having < 3 reads or < 1kb sequence are ignored.
# 4189 gaps <= -1000; 7 gaps <= -10000; 0 gaps <= -100000
# fraction of gaps < -10kb or more than 4 deviations below zero: 0.812%
# 27886 gaps > 10kb, 0 gaps > 50kb, 0 gaps > 200kb, 0 gaps > 1Mb
# 80.92% of reads were used in the assembly (83.85% of bases, 85.43% of Q20 bases)
# 0% of reads were used multiply in the assembly
# 621891 contigs, having N50 length 4346
# total contig length: 1826863513, spanning 2856433242 bases (with 36% in gaps)
# 199050 supercontigs, having N50 length 203550 (not including gaps)
# 45.6% of assembly in supers of size < 200000 (1302053740 bases)
# Assembly base coverage: 2.17X.  Assembly Q20 coverage: 1.93X.
# 99.98% of bases have q >= 1
# 96.62% of bases have q >= 20
# 93.57% of bases have q >= 30
# 90.13% of bases have q >= 40
# 86.95% of bases have q >= 50


   # Count scaffolds: take column 1 of the AGP, collapse runs of identical
   # adjacent values with uniq, and count the remaining lines.
   cut -f 1 assembly.agp | uniq -c | wc -l 
   # Number of scaffolds: 298420


#########################################################################
# Create .ra file and run makeGenomeDb.pl
#   Write the makeGenomeDb.pl config for vicVic1, start the build in the
#   background with its log in makeGenomeDb.out, then summarize the
#   sequence sizes listed in chrom.sizes.
    ssh kkstore05
    cd /cluster/data/vicVic1
# NOTE(review): commonName is Alpaca but scientificName is Vicugna vicugna
# (the vicuna); the alpaca is usually listed as Vicugna pacos -- confirm
# which name is intended before reusing this config.
cat << _EOF_ >vicVic1.config.ra
# Config parameters for makeGenomeDb.pl:
db vicVic1
clade mammal
genomeCladePriority 35
scientificName  Vicugna vicugna
commonName Alpaca
assemblyDate Jul. 2008
assemblyLabel Broad Institute vicVic1 
orderKey 233.5
#mitoAcc AJ222767
mitoAcc none
fastaFiles /cluster/data/vicVic1/broad/assembly.bases.gz
agpFiles /cluster/data/vicVic1/broad/assembly.agp
qualFiles /cluster/data/vicVic1/broad/vicVic1.qual.qac
dbDbSpeciesDir alpaca
_EOF_

# Run this inside 'screen', and make sure you are on kkstore05.
    makeGenomeDb.pl -verbose=2 vicVic1.config.ra > makeGenomeDb.out 2>&1 &

# 'ctrl-a ctrl-d' detaches from screen and returns to the previous shell.
# Size statistics over column 2 of chrom.sizes (presumably written by
# makeGenomeDb.pl -- run this once the build has finished):
cut -f 2 chrom.sizes | ave stdin
# Q1 1042.000000
# median 1372.000000
# Q3 2292.000000
# average 9926.782253
# min 600.000000
# max 5516956.000000
# count 298420
# total 2962350360.000000
# standard deviation 67784.633210


#########################################################################
# REPEATMASKER (DONE - 2008-07-13 braney )
#   Run doRepeatMasker.pl for vicVic1 in its own build directory, then
#   record the cluster timing and the rmsk coverage of the assembly.
    ssh kkstore05
    screen # use a screen to manage this job
    mkdir /cluster/data/vicVic1/bed/repeatMasker
    cd /cluster/data/vicVic1/bed/repeatMasker
    # Backgrounded; stdout and stderr both go to do.log.
    doRepeatMasker.pl -buildDir=/cluster/data/vicVic1/bed/repeatMasker \
        vicVic1 > do.log 2>&1 &

    # Note: can run simpleRepeats simultaneously
    #### When done with RM:
    ssh pk
    para time

# Output of 'para time':
# Completed: 7141 of 7141 jobs
# CPU time in finished jobs:   19375689s  322928.14m  5382.14h  224.26d  0.614 y
# IO & Wait Time:                185390s    3089.84m    51.50h    2.15d  0.006 y
# Average job time:                2739s      45.65m     0.76h    0.03d
# Longest finished job:           13925s     232.08m     3.87h    0.16d
# Submission to last job:        175075s    2917.92m    48.63h    2.03d

    # Coverage of the rmsk track, run at the lowest scheduling priority;
    # the result is written to fb.vicVic1.rmsk.txt and copied below.
    time nice -n +19 featureBits vicVic1 rmsk > fb.vicVic1.rmsk.txt 2>&1 &
    # 614774346 bases of 1923010363 (31.969%) in intersection

    # RepeatMasker and lib version from do.log:
    #    Jun 13 2008 (open-3-2-5) version of RepeatMasker
    # CC   RELEASE 20080611;

#########################################################################
# SIMPLE REPEATS TRF (DONE - 2008-07-11 braney)
#   Run doSimpleRepeat.pl for vicVic1, add the resulting TRF mask to the
#   RepeatMasker-masked 2bit to produce the final vicVic1.2bit, then
#   install that file under /gbdb and copy it to cluster scratch space.
    ssh kkstore05
    screen # use a screen to manage this job
    mkdir /cluster/data/vicVic1/bed/simpleRepeat
    cd /cluster/data/vicVic1/bed/simpleRepeat
    # had to change genus/species to "vicugna genus" in dummyRun
    # Backgrounded; stdout and stderr both go to do.log.
    doSimpleRepeat.pl -buildDir=/cluster/data/vicVic1/bed/simpleRepeat \
        vicVic1 > do.log 2>&1 &

    #### When done
    ssh pk
    para time
# Output of 'para time':
# Completed: 60 of 60 jobs
# CPU time in finished jobs:      71877s    1197.95m    19.97h    0.83d  0.002 y
# IO & Wait Time:                  2811s      46.85m     0.78h    0.03d  0.000 y
# Average job time:                1245s      20.75m     0.35h    0.01d
# Longest finished job:           12819s     213.65m     3.56h    0.15d
# Submission to last job:         13133s     218.88m     3.65h    0.15d

    featureBits vicVic1 simpleRepeat
    # 77931003 bases of 1923010363 (4.053%) in intersection

    #	after RM run is done, add this mask:
    cd /cluster/data/vicVic1
    # Input is the RepeatMasker-masked 2bit; -add applies trfMask.bed on top
    # of it and the combined result is written to vicVic1.2bit.
    twoBitMask vicVic1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed vicVic1.2bit

    # Masking statistics for the final (RepeatMasker + TRF) 2bit:
    twoBitToFa vicVic1.2bit stdout | faSize stdin
# 2962350360 bases (1039339997 N's 1923010363 real 1305704427 upper 617305936
# lower) in 298420 sequences in 1 files
# Total size: mean 9926.8 sd 67784.7 min 600 (scaffold_298419) max 5516956
# (scaffold_0) median 1372
# N count: mean 3482.8 sd 21549.2
# U count: mean 4375.4 sd 33856.2
# L count: mean 2068.6 sd 15148.2
# %20.84 masked total, %32.10 masked real

    # For comparison, the same statistics for the RepeatMasker-only 2bit:
    twoBitToFa vicVic1.rmsk.2bit stdout | faSize stdin

# 2962350360 bases (1039339997 N's 1923010363 real 1309436724 upper 613573639
# lower) in 298420 sequences in 1 files
# Total size: mean 9926.8 sd 67784.7 min 600 (scaffold_298419) max 5516956
# (scaffold_0) median 1372
# N count: mean 3482.8 sd 21549.2
# U count: mean 4387.9 sd 33863.9
# L count: mean 2056.1 sd 15140.4
# %20.71 masked total, %31.91 masked real

    ln -s /cluster/data/vicVic1/vicVic1.2bit /gbdb/vicVic1/vicVic1.2bit

    # Make sure the scratch directory exists first: if it were missing, the
    # first cp would create a plain file named vicVic1 and the second cp
    # would overwrite that file with chrom.sizes.
    mkdir -p /san/sanvol1/scratch/vicVic1
    cp /cluster/data/vicVic1/vicVic1.2bit /san/sanvol1/scratch/vicVic1
    cp /cluster/data/vicVic1/chrom.sizes /san/sanvol1/scratch/vicVic1
