# for emacs: -*- mode: sh; -*-

# Bos Taurus -- Baylor Release 3.1 (August 2006)


#########################################################################
# DOWNLOAD SEQUENCE (DONE Sept. 21, 2006 heather)
    ssh kkstore05
    mkdir /cluster/store12/bosTau3
    ln -s /cluster/store12/bosTau3 /cluster/data/bosTau3
    mkdir /cluster/data/bosTau3/baylor
    cd /cluster/data/bosTau3/baylor

    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20060815-freeze/ReadMeBovine.3.1.txt
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20060815-freeze/contigs/*
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20060815-freeze/markers/*

    mkdir chroms
    cd chroms
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20060815-freeze/linearScaffolds/*

NOTE: chrM is available from NCBI nucleotides NC_006853.
I didn't check for it soon enough to include it here, but it should
be included in bosTau4.

#########################################################################
# MAKE GENOME DB (September 2006 Heather)
# I got this right after a few attempts.
# There were 3 things I needed to change in the files from Baylor:
# I put the edits into /cluster/data/bosTau3/fixup.
#
# 1) Their AGP file used capitalized "Chr".  This is not recommended for us.
    ssh kkstore05
    cd /cluster/data/bosTau3
    sed -e 's/Chr/chr/' baylor/Btau20060815.agp > fixup/UCSC.agp
# 2) Their quality scores were reported by scaffold, not by chrom.
#    Convert to qac (UCSC compressed qual score format) and lift from AGP.
    zcat baylor/Btau20060815.contigs.fa.qual.gz | qaToQac stdin fixup/scaffolds.qac
    qacAgpLift fixup/UCSC.agp fixup/scaffolds.qac fixup/chrom.qac
# 3) They used a ".1" suffix in the contig and quality files.
#    An example line from the contig.fa file:
#    >gi|112123023|gb|AAFC03055291.1| Bos taurus Ctg58.CH240-395G16, whole genome shotgun sequence
#    We need just 
#    >AAFC03055291
#    Fix this with variations on a simple C program:

#include "common.h"
#include "linefile.h"

void doTrimHeader(char *inputFileName)
{
FILE *outputFileHandle = mustOpen("trimHeader.out", "w");
struct lineFile *lf = lineFileOpen(inputFileName, TRUE);
char *line;
int lineSize;
char *row[5], *contigId[2];

while (lineFileNext(lf, &line, &lineSize))
    {
    if (line[0] != '>')
        {
	fprintf(outputFileHandle, "%s\n", line);
	continue;
	}
    if (fileType == "contig")
        {
        chopString(line, "|", row, ArraySize(row));
        chopString(row[3], ".", contigId, ArraySize(contigId));
        fprintf(outputFileHandle, ">%s\n", contigId[0]);
	}
    else // qual scores
        {
        chopString(line, ".", contigId, ArraySize(row));
	fprintf(outputFileHandle, "%s\n", contigId[0]);
        }
    }

carefulClose(&outputFileHandle);
lineFileClose(&lf);
}

# At this point, attempt to run makeGenomeDb.pl:

    cat > bosTau3.config.ra <<EOF
# Config parameters for makeGenomeDb.pl:
db bosTau3
clade vertebrate
scientificName Bos Taurus
assemblyDate August 2006
assemblyLabel Baylor Release 3.1
orderKey 19
dbDbSpeciesDir cow
mitoAcc none
agpFiles /cluster/data/bosTau3/fixup/UCSC.agp
fastaFiles /cluster/data/bosTau3/fixup/trimHeader.contigs.fa.gz
qualFiles /cluster/data/bosTau3/fixup/trimHeader.qual.gz
EOF

    ~/kent/src/utils/makeGenomeDb.pl bosTau3.config.ra \
      >& makeGenomeDb.log & tail -f makeGenomeDb.log

# This fails as described in the log file:

# Error: IDs in fastaFiles
# do not perfectly match IDs in either the first or sixth columns of agpFiles 
# Please examine and then remove these temporary files:
  # /tmp/makeGenomeDb.fastaIds.W10474 -- fasta IDs
  # /tmp/makeGenomeDb.agpIds.E10485 -- AGP first column IDs ('big' sequences)
  # /tmp/makeGenomeDb.agpIds.T10489 -- AGP sixth column IDs ('little' sequences)
# Command failed:
  # ssh -x kkr6u00 nice /cluster/data/bosTau3/jkStuff/makeUnmasked2bit.csh

# Move tmp files to kkr6u00:/scratch/heather/bosTau3
# Create diff.clean which has 108 IDs
# Write to Kim Worley (kworley@bcm.tmc.edu)

# Response from Kim on Sept. 22, 2006:

# Those are sequences that were intentionally omitted from the AGP, because they
# were deemed to be contaminants after they were accessioned.
# Please omit them from your assembly.   

# We can use faSomeRecords to get rid of the bad IDs.
# First generate a sorted list of just the IDs in fixup/3.agp

cd /cluster/data/bosTau3/fixup
zcat trimHeader.contigs.fa.gz | faSomeRecords stdin 3.agp UCSC.bosTau3.fa  
gzip fixup/UCSC.bosTau3.fa
zcat trimHeader.qual.gz | faSomeRecords stdin 3.agp UCSC.qual
gzip fixup/UCSC.qual

# Edit bosTau3.config.ra
# fastaFiles /cluster/data/bosTau3/fixup/UCSC.bosTau3.fa.gz
# qualFiles /cluster/data/bosTau3/fixup/UCSC.qual.gz

# Drop database
ssh hgwdev
hgsql -e 'drop database bosTau3' mysql
# Backup files and rerun 
ssh kkstore02
cd /cluster/data/bosTau3
mv chrom.sizes chrom.sizes.1
mv bosTau3.agp bosTau3.agp.1
mv makeGenomeDb.log makeGenomeDb.log.1
/cluster/home/angie/kent/src/utils/makeGenomeDb.pl bosTau3.config.ra > & makeGenomeDb.log &

#########################################################################
# REPEATMASKER (DONE Oct 5, REDONE Oct 16, 2006 heather)
    ssh kkstore05
    cd /cluster/data/bosTau3/bed
    # Run -debug to create the dir structure and preview the scripts:
    ~/kent/src/utils/doRepeatMasker.pl bosTau3 -verbose 3 -debug
    # Run it for real and tail the log:
    ~/kent/src/utils/doRepeatMasker.pl bosTau3 -verbose 3 \
        >& RepeatMasker.2006-10-05/do.log &
    tail -f RepeatMasker.2006-10-05/do.log
    # RepeatMasker and lib version from do.log:
    #    March 20 2006 (open-3-1-5) version of RepeatMasker
    #CC   RELEASE 20060315;                                            *
    # Compare coverage to previous assembly:
    featureBits bosTau3 rmsk
    # 1200525422 bases of 2731807384 (43.946%) in intersection
    featureBits bosTau2 rmsk
    # 1291561904 bases of 2812203870 (45.927%) in intersection

    # Oct 16
    #    October 6 2006 (open-3-1-6) version of RepeatMasker
    #CC   RELEASE 20061006;                                            *
    featureBits bosTau3 rmsk
    # 1270629763 bases of 2731807384 (46.512%) in intersection


#########################################################################
# SIMPLE REPEATS (TRF) (Oct 2006/Feb 2007 heather)
    # look for an idle small cluster machine
    ssh -x kki parasol list machines | grep idle
    ssh kkr1u00
    nice tcsh
    mkdir /cluster/data/bosTau3/bed/simpleRepeat
    cd /cluster/data/bosTau3/bed/simpleRepeat
    twoBitToFa ../../bosTau3.unmasked.2bit stdout \
    | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
      -bedAt=simpleRepeat.bed -tempDir=/tmp \
    >& trf.log & tail -f trf.log

    # Make a filtered version for sequence masking:
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
    splitFileByColumn trfMask.bed trfMaskChrom

    # Load unfiltered repeats into the database:
    ssh hgwdev
    hgLoadBed bosTau3 simpleRepeat \
      /cluster/data/bosTau3/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    # 878 warnings
    # Compare coverage to previous assembly:
    featureBits bosTau3 simpleRepeat
    # 22475721 bases of 2731807384 (0.823%) in intersection
    featureBits bosTau2 simpleRepeat
    # 43628019 bases of 2812203870 (1.551%) in intersection

    # try again, spliting the fa files first this time
    # 13k files in one directory, seems okay...
    ssh kkstore05
    cd /cluster/data/bosTau3
    mkdir split
    cd split
    twoBitToFa bosTau3.unmasked.2bit bosTau3.unmasked.fa
    faSplit byname bosTau3.unmasked.fa .
    rm bosTau3.unmasked.fa
    cd /cluster/data/bosTau3/bed
    mkdir simpleRepeat4
    cd simpleRepeat4
    ./makeJobList-trf.csh
    # split into regular chroms and chrUn
    # regular chroms took about 90 minutes
    ./trf-run.csh > & ! trf.log &
    # chrUn took about 2 hours
    ./trf-run-chrUn.csh > & ! trfUn.log &
    ./concat.csh
    ./concatUn.csh
    # non chrUn 235926
    # chrUn 54713
    cp trf.bed all.bed
    cat trfUn.bed >> all.bed
    hgLoadBed bosTau3 simpleRepeat all.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    featureBits bosTau3 simpleRepeat
    # 22475721 bases of 2731807384 (0.823%) in intersection

    # Same result as first run!   Must be right
    # Not sure why it is less than bosTau2, maybe just the assembly is better

    # Make a filtered version for sequence masking:
    awk '{if ($5 <= 12) print;}' all.bed > trfMask.bed
    splitFileByColumn trfMask.bed trfMaskChrom

    # cleanup
    cd /cluster/data/bosTau3/bed
    mv simpleRepeat simpleRepeatOct13
    mv simpleRepeat2 simpleRepeatOct14
    mv simpleRepeat3 simpleRepeatFeb23
    mv simpleRepeat4 simpleRepeat


#########################################################################
# WINDOWMASKER (TESTING Nov 16, 2006 Andy)
    ssh hgwdev
    screen 
    time ~/kent/src/hg/utils/automation/doWindowMasker.pl \
         -workhorse kolossus bosTau3 >& bosTau3.wmsk.log
    # Note: windowmasker uses the internet so -workhorse is basically a 
    # required option.
    # Cntrl+A+D then tail -f on the bosTau3.wmsk.log and keep an eye on it.
    # Log into kolossus and run top.  Again, keep an eye on it.

#########################################################################
# ooc file
# use same repMatch value as bosTau2
    ssh kkr3u00
    cd /cluster/data/bosTau3
    mkdir bed/ooc
    cd bed/ooc
    ls -1 /cluster/data/bosTau3/bosTau3.2bit > nib.lst
    blat nib.lst /dev/null /dev/null -tileSize=11 -makeOoc=/cluster/bluearc/bosTau3/11.ooc -repMatch=1005

    ssh kkr1u00
    cd /iscratch/i/bosTau3
    cp -p /cluster/bluearc/bosTau3/11.ooc .
    # alternate to iSync suggested by Hiram
    /bin/bash
    for R in 2 3 4 5 6 7
    do
        rsync -a --progress ./ kkr${R}u00:/iscratch/i/bosTau3
    done


#########################################################################
# MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (Heather, Feb. 2007)

    ssh kolossus
    cd /cluster/data/bosTau3
    time twoBitMask bosTau3.rmsk.2bit -add bed/simpleRepeat/trfMask.bed bosTau3.2bit
    # Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which
    # means it might contain block coordinates, but this program uses only the
    # first three fields (the entire span -- no support for blocks).
    # 2.870u 6.542s 0:21.16 44.4%     0+0k 0+0io 5pf+0w

    # Link to it from /gbdb:
    ln -s /cluster/data/bosTau3/bosTau3.2bit /gbdb/bosTau3/bosTau3.2bit

    # Ask cluster-admin to bring up BLAT server

    echo 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) VALUES ("bosTau3", "blat11", 17782, 1, 0); \
          INSERT INTO blatServers (db, host, port, isTrans, canPcr) VALUES ("bosTau3", "blat11", 17783, 0, 1);' \
          | hgsql -h genome-testdb hgcentraltest


#########################################################################
# MAKE DOWNLOADABLE / GOLDENPATH FILES (Heather March 2007)
    cd /cluster/data/bosTau3
    ln -s /cluster/data/bosTau3/bed/RepeatMasker.2006-10-16/bosTau3.fa.out /cluster/data/bosTau3
    makeDownloads.pl bosTau3 -verbose=2 >& jkStuff/downloads.log &


#########################################################################
# PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (Heather, Feb. 2007)

    # pitakluster:
    ssh pk
    mkdir /san/sanvol1/scratch/bosTau3
    cp -p /cluster/data/bosTau3/bosTau3.2bit /san/sanvol1/scratch/bosTau3
    cp -p /cluster/data/bosTau3/chrom.sizes /san/sanvol1/scratch/bosTau3
    mkdir /san/sanvol1/scratch/bosTau3/rmsk
    cp -p /cluster/data/bosTau3/bosTau3.fa.out /san/sanvol1/scratch/bosTau3/rmsk

    # small cluster:
    ssh kkr1u00
    mkdir -p /iscratch/i/bosTau3
    cd /iscratch/i/bosTau3
    cp -p /cluster/data/bosTau3/bosTau3.2bit .
    cp -p /cluster/data/bosTau3/chrom.sizes .
    cp -p /cluster/data/bosTau3/11.ooc .
    # alternate to iSync suggested by Hiram
    /bin/bash
    for R in 2 3 4 5 6 7
    do
        rsync -a --progress ./ kkr${R}u00:/iscratch/i/bosTau3
    done

#########################################################################
# BLASTZ/CHAIN/NET HG18 (Done Feb 2007 heather)
# Do target = hg18 first because bosTau3 is 13k scaffolds (documented in hg18.txt)
# Then do swap

    mkdir /cluster/data/bosTau3/bed/blastz.hg18.2007-02-25
    ln -s /cluster/data/bosTau3/bed/blastz.hg18.2007-02-25 /cluster/data/bosTau3/bed/blastz.hg18
    cd /cluster/data/bosTau3/bed/blastz.hg18
    doBlastzChainNet.pl /cluster/data/hg18/bed/blastz.bosTau3/DEF -swap >& do.log &
    featureBits bosTau3 chainHg18Link
    # 1378293354 bases of 2731807384 (50.454%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET MM8 (Done March 2007 heather)
# Do target = mm8 first because bosTau3 is 13k scaffolds (documented in mm8.txt)
# Then do swap

    mkdir /cluster/data/bosTau3/bed/blastz.mm8.2007-03-23
    ln -s /cluster/data/bosTau3/bed/blastz.mm8.2007-03-23 /cluster/data/bosTau3/bed/blastz.mm8
    cd /cluster/data/bosTau3/bed/blastz.mm8
    doBlastzChainNet.pl /cluster/data/mm8/bed/blastz.bosTau3/DEF -swap >& do.log &
    featureBits bosTau3 chainMm8Link
    # 707594985 bases of 2731807384 (25.902%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET RN4 (Done March 2007 heather)
# Do target = rn4 first because bosTau3 is 13k scaffolds (documented in rn4.txt)
# Then do swap

    mkdir /cluster/data/bosTau3/bed/blastz.rn4.2007-03-26
    ln -s /cluster/data/bosTau3/bed/blastz.rn4.2007-03-26 /cluster/data/bosTau3/bed/blastz.rn4
    cd /cluster/data/bosTau3/bed/blastz.rn4
    doBlastzChainNet.pl /cluster/data/rn4/bed/blastz.bosTau3/DEF -swap >& do.log &
    featureBits bosTau3 chainRn4Link
    # 666169091 bases of 2731807384 (24.386%) in intersection

###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-04-24)
    ssh kkstore05
    # bash  if not using bash shell already

    mkdir /cluster/data/bosTau3/blastDb
    cd /cluster/data/bosTau3
    find split -name "c*" | xargs cat  > temp.fa
    faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
    rm temp.fa
    cd blastDb
    for i in *.fa
    do
	/cluster/bluearc/blast229/formatdb -i $i -p F
    done
    echo *.fa | xargs rm

    mkdir -p /san/sanvol1/scratch/bosTau3/blastDb
    cd /cluster/data/bosTau3/blastDb
    for i in nhr nin nsq; 
    do 
	echo $i
	for j in *.$i
	do
	    cp $j /san/sanvol1/scratch/bosTau3/blastDb
	done
    done

    mkdir -p /cluster/data/bosTau3/bed/tblastn.hg18KG
    cd /cluster/data/bosTau3/bed/tblastn.hg18KG
    echo  /san/sanvol1/scratch/bosTau3/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst
# 16114 query.lst

   # we want around 350000 jobs
   calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(350000/`wc query.lst | awk "{print \\\$1}"`\)

# 36727/(350000/16114) = 1690.911080

# 36727/(250000/16114) = 2367.275512

   mkdir -p /cluster/bluearc/bosTau3/bed/tblastn.hg18KG/kgfa
   split -l 1690 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  /cluster/bluearc/bosTau3/bed/tblastn.hg18KG/kgfa/kg
   ln -s /cluster/bluearc/bosTau3/bed/tblastn.hg18KG/kgfa kgfa
   cd kgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
     done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   mkdir -p /cluster/bluearc/bosTau3/bed/tblastn.hg18KG/blastOut
   ln -s /cluster/bluearc/bosTau3/bed/tblastn.hg18KG/blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/bosTau3/bed/tblastn.hg18KG
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/bosTau3/blastDb.lft carry $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3

        if pslCheck -prot $3.tmp                                                  
        then                                                                      
            mv $3.tmp $3                                                          
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0                                                                    
    fi                                                                            
fi                                                                                
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec
    exit 
    
    ssh pk
    cd /cluster/data/bosTau3/bed/tblastn.hg18KG
    para create blastSpec
#    para try, check, push, check etc.

# Completed: 354508 of 354508 jobs
# CPU time in finished jobs:   43134111s  718901.85m 11981.70h  499.24d  1.368 y
# IO & Wait Time:              23636059s  393934.32m  6565.57h  273.57d  0.749 y
# Average job time:                 188s       3.14m     0.05h    0.00d
# Longest finished job:            1330s      22.17m     0.37h    0.02d
# Submission to last job:        247330s    4122.17m    68.70h    2.86d

    ssh kkstore05
    cd /cluster/data/bosTau3/bed/tblastn.hg18KG
    for i in blastOut/*
    do  
	echo "cd $i; echo *.psl | xargs cat | pslSortAcc nohead chrom /tmp/ stdin ; cd ../.."
    done > sort.jobs

    sh -x sort.jobs

    ssh kkstore05
    cd /cluster/data/bosTau3/bed/tblastn.hg18KG
    mkdir chainRun
    cd chainRun
    tcsh
  cat << '_EOF_' > chainOne
simpleChain -prot -outPsl -maxGap=150000 $1 `dirname $1`/c.`basename $1`.psl
'_EOF_'

    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    chmod +x chainOne
    ls  ../blastOut/*/chrom/*.psl > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    # do the cluster run for chaining
    ssh kk
    cd /cluster/data/bosTau3/bed/tblastn.hg18KG/chainRun
    para create chainSpec
    para maxNode 30
    para try, check, push, check etc.

# Completed: 660 of 682 jobs
# Crashed: 22 jobs
# CPU time in finished jobs:     249233s    4153.89m    69.23h    2.88d  0.008 y
# IO & Wait Time:                  4468s      74.46m     1.24h    0.05d  0.000 y
# Average job time:                 384s       6.41m     0.11h    0.00d
# Longest finished job:            2190s      36.50m     0.61h    0.03d
# Submission to last job:          9537s     158.95m     2.65h    0.11d

# all the chrUn jobs crashed on kk, re-do on kolossus

    ssh kkstore05
    cd /cluster/data/bosTau3/bed/tblastn.hg18KG/blastOut
    for i in kg??
    do
       cat $i/chrom/c.*.psl|awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/bosTau3/bed/tblastn.hg18KG/blastHg18KG.psl
    cd ..
    pslCheck blastHg18KG.psl

    # load table 
    ssh hgwdev
    cd /cluster/data/bosTau3/bed/tblastn.hg18KG
    hgLoadPsl bosTau3 blastHg18KG.psl

    # check coverage
    featureBits bosTau3 blastHg18KG 
# 35001855 bases of 2731807384 (1.281%) in intersection

    featureBits bosTau3 refGene:cds blastHg18KG  -enrichment
# refGene:cds 0.292%, blastHg18KG 1.281%, both 0.258%, cover 88.42%, enrich  69.01x

    ssh kkstore05
    rm -rf /cluster/data/bosTau3/bed/tblastn.hg18KG/blastOut
    rm -rf /cluster/bluearc/bosTau3/bed/tblastn.hg18KG/blastOut
#end tblastn
##########################################################################

# CpG Islands (Done, June 27 2007, Heather)

    ssh kkstore05
    cd /cluster/data/bosTau3/bed
    mkdir cpgIsland
    cd cpgIsland
    zcat ../../goldenPath/bigZips/bosTau3.fa.masked.gz | faSplit about stdin 2000000 split

    ssh hgwdev
    cd /cluster/data/bosTau3/bed/cpgIsland
    # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
    cvs co hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    make
    # gcc readseq.c cpg_lh.c -o cpglh.exe
    # cpg_lh.c: In function `main':
    # cpg_lh.c:74: warning: conflicting types for built-in function 'malloc'
    mv cpglh.exe /cluster/data/bosTau3/bed/cpgIsland/
    
    ssh kkstore05
    # could also have used kolossus
    cd /cluster/data/bosTau3/bed/cpgIsland
    cat << '_EOF_' > run.csh
#!/bin/csh -ef
foreach f (split*)
    set fout=$f:t:r:r.cpg
    echo $fout
    ./cpglh.exe $f > $fout
end
'_EOF_'
    # << this line makes emacs coloring happy
    run.csh

    # Transform cpglh output to bed +
    cat << '_EOF_' > filter.awk
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
    # << this line makes emacs coloring happy
    awk -f filter.awk *.cpg > cpgIsland.bed

    # load into database
    ssh hgwdev
    cd /cluster/data/bosTau3/bed/cpgIsland
    hgLoadBed bosTau3 cpgIslandExt -tab \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
    wc -l cpgIsland.bed 
    # 38189 cpgIsland.bed
    featureBits bosTau3 cpgIslandExt
    # 24374280 bases of 2731807384 (0.892%) in intersection


##########################################################################
# GENSCAN (June 2007 Heather)
# Assume none of the 200K scaffolds are all Ns
# (These cause Genscan to run forever)
    ssh hgwdev
    mkdir /cluster/data/bosTau3/bed/genscan
    cd /cluster/data/bosTau3/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    cvs co hg3rdParty/genscanlinux

    # generate hard-masked sequence
    ssh kkstore05
    cd /cluster/data/bosTau3/bed/genscan
    zcat ../../goldenPath/bigZips/bosTau3.fa.gz | maskOutFa stdin hard bosTau3.hardmask.fa
    mkdir split
    cd split
    faSplit about ../bosTau3.hardmask.fa 2000000 split
    # Generate file list and check that no files are completed masked
    cd ..
    cat << '_EOF_' > makeList.csh
    foreach f ( /cluster/data/bosTau3/bed/genscan/split/*.fa )
        egrep '[ACGT]' $f > /dev/null
        if ($status == 0) echo $f >> genome.list
    end
    '_EOF_'
    wc -l genome.list
    # 252 genome.list

    # Log into kki (not kk!).  kki is the driver node for the small
    # cluster (kkr1u00-kkr8u00). Genscan has a problem running on the
    # big cluster, due to limited memory and swap space on each
    # processing node.
    ssh kki
    cd /cluster/data/bosTau3/bed/genscan
    # Create template file, gsub, for gensub2.  For example (3-line file):
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    /parasol/bin/gensub2 genome.list single gsub jobList

    para create jobList
    para try
    para check
    para push 

    # chr9 (split08.fa) failed due to insufficient memory
    # Tried again on kki, still failed
    # Tried on kolossus, still failed
    # Tried on kolossus with -window=1000000, that worked

    # Concatenate
    ssh kkstore05
    cd /cluster/data/bosTau3/bed/genscan
    cat gtf/*.gtf > genscan.gtf
    cat pep/*.pep > genscan.pep
    cat subopt/*.bed > genscanSubopt.bed

    # Load into the database 
    ssh hgwdev
    cd /cluster/data/bosTau3/bed/genscan
    ldHgGene -gtf bosTau3 genscan genscan.gtf

    hgPepPred bosTau3 generic genscanPep genscan.pep
    hgLoadBed bosTau3 genscanSubopt genscanSubopt.bed

    featureBits bosTau3 genscan
    # 59251085 bases of 2731807384 (2.169%) in intersection
    featureBits bosTau3 genscanSubopt
    # 58996872 bases of 2731807384 (2.160%) in intersection
    # Should be zero intersection with rmsk
    featureBits bosTau3 genscan rmsk


##########################################################################
# MAKE SCAFFOLD LIFT FILE (DONE 11/14/07 angie)
# Bovine consortium is doing analysis in scaffold coords.  Make a liftUp
# for viewing in GB.
    cd /cluster/data/bosTau3/baylor
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20060815-freeze/individualScaffolds/Btau3.1.scaffolds.chr
    # Exclude chrUn from the lift file for two bosTau3-specific reasons:
    # chrUn scaffolds were kept separate, and they used a different prefix
    # (bosTau3 prefix is "chrUn.003" while Baylor scaffold prefix is just
    # ChrUn)
    sed -e '/ChrUn/ d; s/Chr30/ChrX/' Btau3.1.scaffolds.chr \
    | ~/kent/src/utils/baylorScaffoldChrToAgp.pl \
      ../chrom.sizes \
    | agpToLift -revStrand \
      > ../jkStuff/scaffolds.lft
    # Actually, we can add on some name-translating .lft lines for chrUn.*:
    perl -wpe '@w=split; if ($w[0] !~ /ChrUn/) { s/^.*\n$//; next; } \
               $w[0] =~ s/^.*Chr/Chr/; \
               $chr = $w[0];  $chr =~ s/ChrUn/chrUn.003/; \
               $size = $w[2] - $w[1] + 1;\
               s/^.*/0\t$w[0]\t$size\t$chr\t$size\t$w[3]/; \
      ' Btau3.1.scaffolds.chr \
    > /tmp/chrUn.lft
    # Make sure sizes agree:
    grep ^chrUn ../chrom.sizes > /tmp/1
    awk '{print $4 "\t" $5;}' /tmp/chrUn.lft > /tmp/2
    cmp /tmp/1 /tmp/2
    # Add chrUn lines to main .lft:
    cat /tmp/chrUn.lft >> ../jkStuff/scaffolds.lft


##########################################################################
# MAKE SCAFFOLDS TRACK (DONE 11/14/07 angie)
    ssh hgwdev
    mkdir /cluster/data/bosTau3/bed/scaffolds
    cd /cluster/data/bosTau3/bed/scaffolds
    # bosTau3's chrUn.* scaffolds are separate, so make this track only 
    # for the assembled chroms.
    perl -wpe 'if (/ChrUn/) {s/^.*\n$//; next;} \
           m/(Chr\w+)\.(\d+)\t(\d+)\t(\d+)\t([+N-])/ || die; \
           ($chr, $scafNum, $start, $end, $strand) = ($1, $2, $3, $4, $5); \
           $scafName = "$chr.$scafNum"; \
           $chr =~ s/^C/c/; $chr =~ s/chr30/chrX/; \
           $start--;  $strand =~ s/N/+/; \
           s/^.*$/$chr\t$start\t$end\t$scafName\t0\t$strand/;' \
        ../../baylor/Btau3.1.scaffolds.chr \
      > scaffolds.bed
    hgLoadBed bosTau3 scaffold scaffolds.bed


##########################################################################
# MAKE PER-CHROM FASTA DOWNLOADS (DONE 12/7/07 angie)
    ssh kkstore05
    mkdir /cluster/data/bosTau3/goldenPath/chromosomes
    cd /cluster/data/bosTau3/goldenPath/chromosomes
    awk '$1 ~ /^chrUn/ {print $1;}' ../../chrom.sizes > scafs.lst
    awk '$1 !~ /^chrUn/ {print $1;}' ../../chrom.sizes > chroms.lst
    foreach chr (`cat chroms.lst`)
      twoBitToFa ../../bosTau3.2bit stdout -seq=$chr \
      | gzip -c > $chr.fa.gz
    end
    twoBitToFa ../../bosTau3.2bit stdout -seqList=scafs.lst \
    | gzip -c > chrUn.scaffolds.fa.gz
    rm chroms.lst scafs.lst
    md5sum *.gz > md5sum.txt
    # Make README.txt
    ssh hgwdev ln -s /cluster/data/bosTau3/goldenPath/chromosomes \
      /usr/local/apache/htdocs/goldenPath/bosTau3/chromosomes


############################################################################
# SGP GENES (DONE - 2007-12-20 - Hiram)
    ssh kkstore05
    mkdir  /cluster/data/bosTau3/bed/sgp
    cd  /cluster/data/bosTau3/bed/sgp

    mkdir gtf
    #   chrUn is done in a different manner
for C in `grep -v chrUn /cluster/data/bosTau3/chrom.sizes | awk '{print $1}'`
do
    wget --timestamping \
"http://genome.imim.es/genepredictions/B.taurus/hgsc_Btau20060815/SGP/mm9/${C}.gtf" \
        -O "gtf/${C}.gtf"
done
    #	a single chrUn.gtf file has all the scaffolds
    wget --timestamping \
'http://genome.imim.es/genepredictions/B.taurus/hgsc_Btau20060815/SGP/mm9/chrUn.gtf' \
	-O "gtf/chrUn.gtf"

    ssh hgwdev
    cd /cluster/data/bosTau3/bed/sgp
    ldHgGene -gtf -genePredExt bosTau3 sgpGene gtf/chr*.gtf
    #	Read 41496 transcripts in 308500 lines in 31 files
    #	41496 groups 4767 seqs 1 sources 3 feature types
    #	41496 gene predictions

    featureBits bosTau3 sgpGene
    #	40658543 bases of 2731807384 (1.488%) in intersection

    featureBits bosTau3 -enrichment refGene:CDS sgpGene
    #	refGene:CDS 0.426%, sgpGene 1.488%, both 0.384%, cover 90.22%,
    #	enrich 60.62x

#####################################################################
# LOAD GENEID GENES (DONE - 2007-12-20, 2008-03-27 - Hiram)
    ssh kkstore05
    mkdir -p /cluster/data/bosTau3/bed/geneid
    cd /cluster/data/bosTau3/bed/geneid
    mkdir gtf prot

    #	The data load in December 2007 was broken at their end.
    #	download new data 27 March 2008 and reload this track

for C in `grep -v chrUn /cluster/data/bosTau3/chrom.sizes | awk '{print $1}'`
do
  echo $C
  wget --timestamping \
"http://genome.imim.es/genepredictions/B.taurus/hgsc_Btau20060815/geneidv1.2/${C}.gtf" \
	-O "gtf/${C}.gtf"
      wget --timestamping \
"http://genome.imim.es/genepredictions/B.taurus/hgsc_Btau20060815/geneidv1.2/${C}.prot" \
	-O "prot/${C}.prot"
    done
    wget --timestamping \
"http://genome.imim.es/genepredictions/B.taurus/hgsc_Btau20060815/geneidv1.2/chrUn.gtf" \
	-O "gtf/chrUn.gtf"
    wget --timestamping \
"http://genome.imim.es/genepredictions/B.taurus/hgsc_Btau20060815/geneidv1.2/chrUn.prot" \
	-O "prot/chrUn.prot"

    # Add missing .1 to protein id's
    cd prot
    foreach f (*.prot)
      perl -wpe 's/^(>chr\S+)$/$1.1/' $f > $f:r-fixed.prot
    end

    ssh hgwdev
    cd /cluster/data/bosTau3/bed/geneid
    ldHgGene -genePredExt -gtf bosTau3 geneid gtf/*.gtf
# Read 50657 transcripts in 325618 lines in 31 files
# 50657 groups 12601 seqs 1 sources 3 feature types
# 50657 gene predictions

    hgPepPred bosTau3 generic geneidPep prot/chr*-fixed.prot
    featureBits bosTau3 geneid
    #	44623578 bases of 2731807384 (1.633%) in intersection
    featureBits bosTau3 -enrichment refGene:CDS geneid
# refGene:CDS 0.429%, geneid 1.633%, both 0.364%, cover 84.64%, enrich 51.82x

#############################################################################
# LIFTOVER TO bosTau4 (WORKING - 2008-04-02 - Hiram)
    ssh kkstore05
    screen -r -d	#	use screen to control this job
    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl -debug bosTau3 bosTau4 \
      -ooc /cluster/bluearc/bosTau3/11.ooc
    # Real run:
    cd /cluster/data/bosTau3/bed/blat.bosTau4.2008-04-02
    time nice -n +19 doSameSpeciesLiftOver.pl bosTau3 bosTau4 \
      -ooc /cluster/bluearc/bosTau3/11.ooc > do.log 2>&1 &
    #	real    587m45.013s

    #	this broke down on kolossus due to out of disk space on
    #	/scratch/tmp/ - so, go to one of the memk nodes, and finish
    #	the doNet.csh script
    ssh mkr0u0
    cd /cluster/data/bosTau3/bed/blat.bosTau4.2008-04-02/run.chain
    time ./doNet.csh
    #	real    81m17.919s
    #	then, continuing:
    ssh kkstore05
    cd /cluster/data/bosTau3/bed/blat.bosTau4.2008-04-02
    time nice -n +19 doSameSpeciesLiftOver.pl -continue=load bosTau3 bosTau4 \
	-buildDir=/cluster/data/bosTau3/bed/blat.bosTau4.2008-04-02 \
	-ooc /cluster/bluearc/bosTau3/11.ooc > load.log 2>&1 &
    #	real    1m17.778s

############################################################################
#  bosTau3 - Cow - Ensembl Genes (DONE - 2008-04-22 - hiram)
#	This one had to be done manually.  There was a chrUn lift file that
#	needed to be made to turn the Ensembl chrUn coordinates into
#	the UCSC chrUn.003.N contig coordinates
#	It was run with a geneScaffolds yes to fetch the MySQL tables,
#	Then a script was run:
    ssh hgwdev
    cd /cluster/data/bosTau3
    cat << '_EOF_' > jkStuff/chrUnLiftAcross.pl
#!/usr/bin/env perl

use strict;
use warnings;

open (FH,"<chrUn.seq_region.txt") or die "can not read chrUn.seq_region.txt";

my $start = 0;
my $end = 0;
my $gap = 10000;

while (my $line = <FH>) {
    chomp $line;
    my ($region_id, $name, $type, $size) = split('\s+', $line);
    $end = $start + $size;
    printf "chrUn\t%d\t%d\t%s\t%d\t%d\t+\n", $start, $end, $name, 0, $size;
    $start += $size + $gap;
}

close (FH);
'_EOF_'
    # << happy emacs
    chmod +x jkStuff/chrUnLiftAcross.pl
    cd bed/ensGene.49/download
    ../../../jkStuff/chrUnLiftAcross.pl > ../../../jkStuff/chrUn.liftAcross.txt
    #	then use that chrUn.liftAcross.txt in the process script procedure
    #	after that, comment out the gene Scaffolds and -continue=load
    ssh kkstore05
    cd /cluster/data/bosTau3
    cat << '_EOF_' > bosTau3.ensGene.ra
# required db variable
db bosTau3
# optional nameTranslation, the sed command that will transform
#       Ensemble names to UCSC names.  With quotes just to make sure.
nameTranslation "s/^\([0-9UX][0-9n]*\)/chr\1/; /^MT/d"
# cause SQL tables to be fetched to see if chrUn can be fixed up
# geneScaffolds yes
'_EOF_'
#  << happy emacs
    $HOME/kent/src/hg/utils/automation/doEnsGeneUpdate.pl \
	-ensVersion=49 -stop=process bosTau3.ensGene.ra
    #	do the manual fixups as described above, then continue
    doEnsGeneUpdate.pl -continue=load -ensVersion=49 bosTau3.ensGene.ra
    ssh hgwdev
    cd /cluster/data/bosTau3/bed/ensGene.49
    featureBits bosTau3 ensGene
    # 39278215 bases of 2731807384 (1.438%) in intersection

############################################################################
# SELF BLASTZ (DONE - 2008-06-30 - Hiram)
    #	do not want the noise from the contigs
    ssh kkstore05
    screen # use a screen to manage this multi-day job
    mkdir /cluster/data/bosTau3/noContigs
    cd /cluster/data/bosTau3/noContigs
    cut -f1 ../chrom.sizes | grep -v chrUn | while read C
do
    echo "twoBitToFa -seq=${C} ../bosTau3.2bit ${C}.fa"
    twoBitToFa -seq=${C} ../bosTau3.2bit ${C}.fa
done
    faToTwoBit chr*.fa bosTau3.noContigs.2bit

    twoBitToFa bosTau3.noContigs.2bit stdout | faSize stdin
# 2434234369 bases (141856351 N's 2292378018 real
#	1246068866 upper 1046309152 lower) in 30 sequences in 1 files
# %42.98 masked total, %45.64 masked real

    grep -v chrUn ../chrom.sizes | ave -col=2 stdin
# count 30
# total 2434234369.000000

    twoBitInfo bosTau3.noContigs.2bit stdout | sort -k2nr \
	> bosTau3.noContigs.chrom.sizes
    cp -p bosTau3.noContigs.2bit /cluster/bluearc/scratch/data/bosTau3
    cp -p bosTau3.noContigs.chrom.sizes /cluster/bluearc/scratch/data/bosTau3

    mkdir /cluster/data/bosTau3/bed/blastzSelf.2008-08-30
    cd /cluster/data/bosTau3/bed
    ln -s blastzSelf.2008-08-30 blastzSelf
    cd blastzSelf.2008-08-30

    cat << '_EOF_' > DEF
BLASTZ_M=400

# TARGET: Cow bosTau3
SEQ1_DIR=/cluster/bluearc/scratch/data/bosTau3/bosTau3.noContigs.2bit
SEQ1_LEN=/cluster/bluearc/scratch/data/bosTau3/bosTau3.noContigs.chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Cow bosTau3
SEQ2_DIR=/cluster/bluearc/scratch/data/bosTau3/bosTau3.noContigs.2bit
SEQ2_LEN=/cluster/bluearc/scratch/data/bosTau3/bosTau3.noContigs.chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/bosTau3/bed/blastzSelf.2008-08-30
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    cd /cluster/data/bosTau3/bed/blastzSelf.2008-08-30
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
	`pwd`/DEF > blastz.out 2>&1 &
    #	failed during loading
XXXX - running 2008-06-30 11:49
    #	real    3537m49.719s

############################################################################
