# for emacs: -*- mode: sh; -*-


# This file describes how we made the browser database on the 
# Dog (Canis familiaris) July 2004 release.


# CREATE BUILD DIRECTORY (DONE 7/1/04 angie)
    ssh kksilo
    mkdir /cluster/store7/canFam1
    ln -s /cluster/store7/canFam1 /cluster/data/canFam1


# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE 7/1/04 angie)
    mkdir /cluster/data/canFam1/M
    cd /cluster/data/canFam1/M
    # go to http://www.ncbi.nih.gov/ and search Nucleotide for 
    # "canis familiaris mitochondrion genome".  That shows the gi number:
    # 17737322
    # Use that number in the entrez linking interface to get fasta:
    wget -O chrM.fa \
      'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=17737322&dopt=FASTA'
    # Edit chrM.fa: make sure the long fancy header line says it's the 
    # Canis familiaris mitochondrion complete genome, and then replace the 
    # header line with just ">chrM".


# MAKE JKSTUFF AND BED DIRECTORIES (DONE 7/1/04 angie)
    # This used to hold scripts -- better to keep them inline in the .doc 
    # so they're in CVS.  Now it should just hold lift file(s) and 
    # temporary scripts made by copy-paste from this file.  
    mkdir /cluster/data/canFam1/jkStuff
    # This is where most tracks will be built:
    mkdir /cluster/data/canFam1/bed


# DOWNLOAD AGP, FASTA & QUAL (DONE 7/7/04 angie)
    ssh kksilo
    mkdir /cluster/data/canFam1/broad
    cd /cluster/data/canFam1/broad
    ftp ftp.broad.mit.edu
      # use username and password emailed by David Jaffe 7/3/04.
      prompt
      bin
      mget Dog1.0.agp Dog1.0.agp.chromosome.fasta.gz
      mget Dog1.0.agp.chromosome.qual.gz assembly.format supercontigs
      mget Dog1.0.masked.agp.chromosome.fasta.gz
      mget supercontigs.summary contigs.bases.gz
      bye
    # Loaded the above 7/5; Reloaded new versions on 7/6 of 
    # Dog1.0.agp Dog1.0.agp.chromosome.fasta.gz 
    # Dog1.0.agp.chromosome.qual.gz Dog1.0.masked.agp.chromosome.fasta.gz
    # And on 7/7 of Dog1.0.agp Dog1.0.agp.chromosome.fasta.gz 
    # Dog1.0.agp.chromosome.qual.gz


# UNPACK CHROM FASTA & AGP AND CROSS-CHECK (DONE 7/7/04 angie)
    # For some reason, the chromosome fasta has been split into 5000000-base
    # chunks.  Fix the headers so that there is one record per chromosome,
    # and make one .fa file per chromosome.  
    ssh kksilo
    cd /cluster/data/canFam1/broad
    cat > ../jkStuff/fixBroadChromHeaders.pl <<'_EOF_'
#!/usr/bin/perl -w
while (<>) {
  if (/^>(\w+)\.1-\d+/) {
    $chr = "chr$1";
    print ">$chr\n";
  } elsif (/^>\w+\.\d+000001-\d+/) {
    # Discard this extra header line. 
  } elsif (/^>/) {
    die "Cant parse line $.:\n$_";
  } else {
    print;
  }
}
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x ../jkStuff/fixBroadChromHeaders.pl
    mkdir chroms
    zcat Dog1.0.agp.chromosome.fasta.gz \
    | ../jkStuff/fixBroadChromHeaders.pl \
    | faSplit byname stdin chroms/
    faSize chroms/chr*.fa
#2519779136 bases (159950770 N's 2359828366 real) in 40 sequences in 40 files
#Total size: mean 62994478.4 sd 23674450.6 min 26492144 (chr38) max 126913094 (chrX) median 61172541
#N count: mean 3998769.2 sd 2953878.1
    # Make our usual chrom dir structure and move chr*.fa into it.
    cp /dev/null ../chrom.lst
    foreach f (chroms/chr*.fa)
      set chr = $f:t:r
      set c = `echo $chr | sed -e 's/^chr//;'`
      echo $c >> ../chrom.lst
      mkdir ../$c
      mv $f ../$c
    end
    echo M >> ../chrom.lst

    # Split AGP file into per-chrom AGP files in usual chrom dir structure.
    cd /cluster/data/canFam1
    foreach c (`cat chrom.lst`)
      grep -w "^chr$c" broad/Dog1.0.agp > $c/chr$c.agp
    end
    rm M/chrM.agp
    wc -l ?{,?}/chr*.agp
# 119853 total
    wc -l broad/Dog1.0.agp
# 119853 broad/Dog1.0.agp

    # checkAgpAndFa prints out way too much info -- keep the end/stderr only:
    cd /cluster/data/canFam1
    foreach agp (?{,?}/chr*.agp)
      set fa = $agp:r.fa
      echo checking consistency of $agp and $fa
      checkAgpAndFa $agp $fa | tail -1
    end
    

# UNPACK CHROM QUAL SCORES (DONE 7/7/04 angie)
    ssh kksilo
    cd /cluster/data/canFam1/broad
    # Reuse fixBroadChromHeaders.pl script created in previous step, 
    # compress to .qac for use in quality scores step later on.
    zcat Dog1.0.agp.chromosome.qual.gz \
    | ../jkStuff/fixBroadChromHeaders.pl \
    | qaToQac stdin fixedQuals.qac


# DOUBLE-CHECK BY BUILDING ALT CHROM FASTA FROM CONTIGS+AGP (DONE 7/7/04 angie)
    # This is a redundant check, very low chance of finding any problem 
    # since checkAgpAndFa passed.  It's also very slow.  So don't put this 
    # in the critical path to masked sequence -- run it in parallel.
    ssh kolossus
    cd /cluster/data/canFam1
    mkdir /cluster/bluearc/canFam1
    gunzip -c broad/contigs.bases.gz > /cluster/bluearc/canFam1/contigs.fa
    # The number of "real" bases here should be equal to the number of 
    # "real" bases in the chrom fasta -- and it is now on 7/7, whew.
    faSize /cluster/bluearc/canFam1/contigs.fa
#2359828366 bases (0 N's 2359828366 real) in 59927 sequences in 1 files
#Total size: mean 39378.4 sd 70249.8 min 352 (contig_41721) max 1356923 (contig_5971) median 8974
#N count: mean 0.0 sd 0.0
    foreach agp (?{,?}/chr*.agp)
      set fa = $agp:r.fa2
      echo building $fa from $agp
      awk '$5 == "W" {print $6;}' $agp > /tmp/contigs.lst
      faSomeRecords /cluster/bluearc/canFam1/contigs.fa  /tmp/contigs.lst \
        /tmp/contigs.fa
      agpToFa -simpleMultiMixed $agp $agp:t:r $fa /tmp/contigs.fa
    end
    faSize */chr*.fa2
#2519779136 bases (159950770 N's 2359828366 real) in 40 sequences in 40 files
#Total size: mean 62994478.4 sd 23674450.6 min 26492144 (chr38) max 126913094 (chrX) median 61172541
#N count: mean 3998769.2 sd 2953878.1
    foreach f (?{,?}/chr*.fa2)
      faCmp $f $f:r.fa
    end
    rm ?{,?}/chr*.fa2


# COMPARE CONTIG & CHROM NON-N BASE COUNTS (DONE 7/7/04 angie)
    # In the 7/6 assembly, faSize showed different "real" base counts
    # for contigs (2359828366) vs. chroms (2359035963): contigs have
    # 792403 more non-N bases than chroms.
    # Here's how I determined that the difference was explained by the 
    # omission of some contigs from the AGP.  
    # In the 7/7 assembly, all contigs are in the AGP so we're OK now.
    ssh kksilo
    cd /cluster/data/canFam1
    awk '$5 == "W" {print $6;}' broad/Dog1.0.agp | sort > /tmp/1
    uniq /tmp/1 > /tmp/2
    wc -l /tmp/[12]
#  59927 /tmp/1
#  59927 /tmp/2
# 119854 total
    zcat broad/contigs.bases.gz | g '^>' | sed -e 's/^>//' | sort > /tmp/3
    wc -l /tmp/3
#  59927 /tmp/3
    diff /tmp/1 /tmp/2
# (no difference ==> each contig appears only once)
    diff /tmp/1 /tmp/3 | egrep -v '^[0-9]+' | wc -l
#     0
    # Excellent!  0 contigs missing.  So we don't have to do this stuff,
    # but here it is in case we need it again in the future:
    set missers =  `diff /tmp/1 /tmp/3 | egrep -v '^[0-9]+' | sed -e 's/^> //'`
    zcat broad/contigs.bases.gz | faSize -detailed=on stdin > /tmp/4
    set total = 0
    foreach ctg ($missers)
      set size = `grep -w $ctg /tmp/4 | awk '{print $2;}'`
      echo "$ctg\t$size"
      set total = `expr $total + $size`
    end
    echo "total\t\t$total"
#total           0


# BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CONTIGS/GAPS (DONE 7/7/04 angie)
    ssh kksilo
    cd /cluster/data/canFam1
    foreach agp (?{,?}/chr*.agp)
      set fa = $agp:r.fa
      echo splitting $agp and $fa
      cp -p $agp $agp.bak
      cp -p $fa $fa.bak
      splitFaIntoContigs $agp $fa . -nSize=5000000
    end
    # No _random's in this assembly, so no need to clean up after them.
    # Make a "pseudo-contig" for processing chrM too:
    mkdir M/chrM_1
    sed -e 's/chrM/chrM_1/' M/chrM.fa > M/chrM_1/chrM_1.fa
    mkdir M/lift
    echo "chrM_1/chrM_1.fa.out" > M/lift/oOut.lst
    echo "chrM_1" > M/lift/ordered.lst
    set msize = `faSize M/chrM.fa | awk '{print $1;}'`
    echo "0\tM/chrM_1\t$msize\tchrM\t$msize" > M/lift/ordered.lft
    foreach f ( ?{,?}/chr*.fa.bak )
      faCmp $f $f:r
    end


# MAKE LIFTALL.LFT (DONE 7/7/04 angie)
    ssh kksilo
    cd /cluster/data/canFam1
    cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft


# CREATING DATABASE (DONE 7/6/04 angie)
    ssh hgwdev
    echo 'create database canFam1' | hgsql ''
    # Use df to make sure there is at least 75G free on hgwdev:/var/lib/mysql
    df -h /var/lib/mysql
#/dev/sdc1             1.8T  409G  1.3T  25% /var/lib/mysql


# CREATING GRP TABLE FOR TRACK GROUPING (DONE 7/6/04 angie)
    ssh hgwdev
    echo "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp" \
      | hgsql canFam1


# MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) NIBS (DONE 7/7/04 angie)
    # Make nib/, unmasked until RepeatMasker and TRF steps are done.
    # Do this now so we can load up RepeatMasker and run featureBits; 
    # can also load up other tables that don't depend on masking.  
    ssh kksilo
    cd /cluster/data/canFam1
    mkdir nib
    foreach f (?{,?}/chr*.fa)
      echo "nibbing $f"
      faToNib $f nib/$f:t:r.nib
    end

    # Make symbolic links from /gbdb/canFam1/nib to the real nibs.
    ssh hgwdev
    mkdir -p /gbdb/canFam1/nib
    foreach f (/cluster/data/canFam1/nib/chr*.nib)
      ln -s $f /gbdb/canFam1/nib
    end
    # Load /gbdb/canFam1/nib paths into database and save size info.
    cd /cluster/data/canFam1
    hgsql canFam1  < $HOME/kent/src/hg/lib/chromInfo.sql
    hgNibSeq -preMadeNib canFam1 /gbdb/canFam1/nib */chr*.fa
    echo "select chrom,size from chromInfo" | hgsql -N canFam1 > chrom.sizes
    # take a look at chrom.sizes, should be 41 lines
    wc chrom.sizes
#     41      82     603 chrom.sizes


# GOLD AND GAP TRACKS (DONE 7/7/04 angie)
    ssh hgwdev
    cd /cluster/data/canFam1
    hgGoldGapGl -noGl -chromLst=chrom.lst canFam1 /cluster/data/canFam1 .
    # featureBits fails if there's no chrM_gap, so make one:
    # echo "create table chrM_gap like chr1_gap" | hgsql canFam1
    # oops, that won't work until v4.1, so do this for the time being:
    hgsql canFam1 -e "create table chrM_gap select * from chr1_gap where 0=1"


# ACCESSIONS FOR CONTIGS (DONE 2004-09-15 kate)
#  Used for Assembly track details, and also ENCODE AGP's
    cd /cluster/data/canFam1/bed
    mkdir -p contigAcc
    cd contigAcc

    # extract WGS contig to accession mapping from Entrez Nucleotide summaries
    # To get the summary file, access the Genbank page for the project 
    # by searching:
    #       genus[ORGN] AND WGS[KYWD]
    # At the end of the page, click on the list of accessions for the contigs.
    # Select summary format, file, and click "send to".
    # output to file <species>.acc
    # 59927 items

    contigAccession dog.acc > contigAcc.tab
        #  59927 contigAcc.tab
    wc -l contigAcc.tab
    
    # verify this corresponds with the Assembly table
    # get summary stats for chrN_gold, using table browser 
    # items matching query    59927 

    # load into database
    hgsql canFam1 < $HOME/kent/src/hg/lib/contigAcc.sql
    echo "LOAD DATA LOCAL INFILE 'contigAcc.tab' INTO TABLE contigAcc" | \
        hgsql canFam1
    hgsql canFam1 -e "SELECT COUNT(*) FROM contigAcc"


# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR CANFAM1 (DONE 7/7/04 angie)
    ssh hgwdev
    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/trackDb
    cvs up -d -P
    # Edit that makefile to add canFam1 in all the right places and do
    make update
    cvs commit makefile
    mkdir -p dog/canFam1
    cvs add dog dog/canFam1
    cvs ci -m "trackDb dir for dog genome(s)" dog/canFam1
    make alpha

    # Add dbDb and defaultDb entries:
    echo 'insert into dbDb (name, description, nibPath, organism,  \
          defaultPos, active, orderKey, genome, scientificName,  \
          htmlPath, hgNearOk, hgPbOk, sourceName)  \
          values("canFam1", "Jul. 2004", \
          "/gbdb/canFam1/nib", "Dog", "chr14:10666612-10673232", 1, \
          18, "Dog", "Canis familiaris", \
          "/gbdb/canFam1/html/description.html", 0, 0, \
          "Broad Institute v. 1.0");' \
    | hgsql -h genome-testdb hgcentraltest
    # If there's another dog assembly, this will be an update not insert:
    echo 'insert into defaultDb (genome, name) values("Dog", "canFam1");' \
    | hgsql -h genome-testdb hgcentraltest


# REPEAT MASKING (DONE 7/7/04 angie)
    #- Split contigs into 500kb chunks, at gaps if possible:
    ssh kksilo
    cd /cluster/data/canFam1
    foreach c (`cat chrom.lst`)
      foreach d ($c/chr${c}*_?{,?})
        cd $d
        echo "splitting $d"
        set contig = $d:t
        faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
          -minGapSize=100
        cd ../..
      end
    end

    #- Make the run directory and job list:
    cd /cluster/data/canFam1
    cat << '_EOF_' > jkStuff/RMDog
#!/bin/csh -fe

cd $1
pushd .
/bin/mkdir -p /tmp/canFam1/$2
/bin/cp $2 /tmp/canFam1/$2/
cd /tmp/canFam1/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec dog $2
popd
/bin/cp /tmp/canFam1/$2/$2.out ./
if (-e /tmp/canFam1/$2/$2.align) /bin/cp /tmp/canFam1/$2/$2.align ./
if (-e /tmp/canFam1/$2/$2.tbl) /bin/cp /tmp/canFam1/$2/$2.tbl ./
if (-e /tmp/canFam1/$2/$2.cat) /bin/cp /tmp/canFam1/$2/$2.cat ./
/bin/rm -fr /tmp/canFam1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/canFam1/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/canFam1
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMDog
    mkdir RMRun
    cp /dev/null RMRun/RMJobs
    foreach c (`cat chrom.lst`)
      foreach d ($c/chr${c}_?{,?})
          set ctg = $d:t
          foreach f ( $d/${ctg}_?{,?}.fa )
            set f = $f:t
            echo /cluster/data/canFam1/jkStuff/RMDog \
                 /cluster/data/canFam1/$d $f \
               '{'check out line+ /cluster/data/canFam1/$d/$f.out'}' \
              >> RMRun/RMJobs
          end
      end
    end

    #- Do the run
    ssh kk
    cd /cluster/data/canFam1/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check,...
#Completed: 6373 of 6373 jobs
#CPU time in finished jobs:   24949716s  415828.60m  6930.48h  288.77d  0.791 y
#IO & Wait Time:                220014s    3666.90m    61.12h    2.55d  0.007 y
#Average job time:                3949s      65.82m     1.10h    0.05d
#Longest job:                     6389s     106.48m     1.77h    0.07d
#Submission to last job:         31173s     519.55m     8.66h    0.36d

    #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
    ssh kksilo
    cd /cluster/data/canFam1
    foreach d (*/chr*_?{,?})
      set contig = $d:t
      echo $contig
      liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
        > /dev/null
    end

    #- Lift pseudo-contigs to chromosome level
    foreach c (`cat chrom.lst`)
      echo lifting $c
      cd $c
      liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` > /dev/null
      cd ..
    end

    #- Load the .out files into the database with:
    ssh hgwdev
    cd /cluster/data/canFam1
    hgLoadOut canFam1 */chr*.fa.out


# VERIFY REPEATMASKER RESULTS (DONE 7/7/04 angie)
    # Eyeball some repeat annotations in the browser, compare to lib seqs.
    # Run featureBits on canFam1 and on a comparable genome build, and compare:
    ssh hgwdev
    featureBits canFam1 rmsk
#896773874 bases of 2359845093 (38.001%) in intersection
    # Good, 35-40% seems typical for mammals...


# COMPARE OUR REPEATMASKER .OUT'S TO BROAD'S (DONE 7/23/04 angie)
    ssh kksilo
    cd /cluster/data/canFam1/broad
    ftp ftp.broad.mit.edu
      # same username & password as for fasta etc.
      bin
      get dog1.0_repeatinfo.tar.gz
      bye
    mkdir /cluster/data/canFam1/bed/compareRmsk
    cd /cluster/data/canFam1/bed/compareRmsk
    tar xvzf ../../broad/dog1.0_repeatinfo.tar.gz
    rm dog1.0_repeatinfo/*.index
    # compare... files?  definitely coverage --- make a bed4.
    cp /dev/null broadRmskBed4.bed
    foreach f (dog1.0_repeatinfo/chr*.fa.out)
      tail +4 $f | awk '{print $5 "\t" $6-1 "\t" $7 "\t" $10;}' \
      >> broadRmskBed4.bed
    end
    wc -l ../../?{,?}/chr*.fa.out | tail -1
#4337285 total
    wc -l dog1.0_repeatinfo/chr*.fa.out | tail -1
#4059835 total
    ssh hgwdev
    cd /cluster/data/canFam1/bed/compareRmsk
    #Compare coverage with our RepeatMasker run:
#896773874 bases of 2359845093 (38.001%) in intersection
    featureBits canFam1 broadRmskBed4.bed
#883152916 bases of 2359845093 (37.424%) in intersection
    featureBits canFam1 broadRmskBed4.bed \!rmsk -bed=notInRmsk.bed
#7053647 bases of 2359845093 (0.299%) in intersection
    featureBits canFam1 \!broadRmskBed4.bed rmsk -bed=notInBroad.bed
#20674605 bases of 2359845093 (0.876%) in intersection
    featureBits canFam1 broadRmskBed4.bed \!rmsk \!simpleRepeat \
      -bed=notInRmskTRF.bed
#6906446 bases of 2359845093 (0.293%) in intersection
    # Load theirs up as "rmskNew" since hgTracks has a registered handler 
    # for that... easier visual comparison.  
    hgLoadOut -table=rmskNew canFam1 dog1.0_repeatinfo/chr*.fa.out

    # back on kksilo
    wc -l notInRmsk.bed 
# 256372 notInRmsk.bed
    awk '$3-$2 >= 40 {printf "%s\t%d\t%d\t%s\t%d\n", $1, $2, $3, $4, $3-$2;}' \
      notInRmsk.bed > notInRmskOver40.bed
    awk '$3-$2 >= 40 {printf "%s\t%d\t%d\t%s\t%d\n", $1, $2, $3, $4, $3-$2;}' \
      notInBroad.bed > notInBroadOver40.bed
    # Take a look at some of those positions in the browser.
    # Here's one we just miss:    chr1:4,524,717-4,524,907
    # Here's one they just miss:  chr1:3,503,297-3,503,495
    # Broad folks agreed the diffs are very small (sometimes due to the fact 
    # that they ran on contigs while we ran on ~500k chunks).


# SIMPLE REPEATS (TRF) (DONE 7/7/04 angie)
    ssh kksilo
    mkdir /cluster/data/canFam1/bed/simpleRepeat
    cd /cluster/data/canFam1/bed/simpleRepeat
    mkdir trf
    cp /dev/null jobs.csh
    foreach d (/cluster/data/canFam1/*/chr*_?{,?})
      set ctg = $d:t
      foreach f ($d/${ctg}.fa)
        set fout = $f:t:r.bed
        echo $fout
        echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
        >> jobs.csh
      end
    end
    csh -ef jobs.csh >&! jobs.log &
    # check on this with
    tail -f jobs.log
    wc -l jobs.csh
    ls -1 trf | wc -l
    endsInLf trf/*
    # When job is done do:
    liftUp simpleRepeat.bed /cluster/data/canFam1/jkStuff/liftAll.lft warn \
      trf/*.bed

    # Load into the database:
    ssh hgwdev
    hgLoadBed canFam1 simpleRepeat \
      /cluster/data/canFam1/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
    featureBits canFam1 simpleRepeat
#36509895 bases of 2359845093 (1.547%) in intersection


# PROCESS SIMPLE REPEATS INTO MASK (DONE 7/7/04/ angie)
    # After the simpleRepeats track has been built, make a filtered version 
    # of the trf output: keep trf's with period <= 12:
    ssh kksilo
    cd /cluster/data/canFam1/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/chr*.bed)
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    # Lift up filtered trf output to chrom coords as well:
    cd /cluster/data/canFam1
    mkdir bed/simpleRepeat/trfMaskChrom
    foreach c (`cat chrom.lst`)
      if (-e $c/lift/ordered.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
          $c/lift/ordered.lst > $c/lift/oTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
      endif
    end
    # Here's the coverage for the filtered TRF:
    ssh hgwdev
    cat /cluster/data/canFam1/bed/simpleRepeat/trfMaskChrom/*.bed \
      > /tmp/filtTrf.bed
    featureBits canFam1 /tmp/filtTrf.bed
#23017541 bases of 2359845093 (0.975%) in intersection
    featureBits canFam1 /tmp/filtTrf.bed \!rmsk
#1275941 bases of 2359845093 (0.054%) in intersection


# MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 7/7/04 angie)
    ssh kksilo
    cd /cluster/data/canFam1
    # Soft-mask (lower-case) the contig and chr .fa's, 
    # then make hard-masked versions from the soft-masked.  
    set trfCtg=bed/simpleRepeat/trfMask
    set trfChr=bed/simpleRepeat/trfMaskChrom
    foreach f (*/chr*.fa)
      echo "repeat- and trf-masking $f"
      maskOutFa -soft $f $f.out $f
      set chr = $f:t:r
      maskOutFa -softAdd $f $trfChr/$chr.bed $f
      echo "hard-masking $f"
      maskOutFa $f hard $f.masked
    end
    # Tons of warnings like this, mostly for L1M*:
#WARNING: negative rEnd: -189 chrX:117586389-117586475 L1M3e
    foreach c (`cat chrom.lst`)
      echo "repeat- and trf-masking contigs of chr$c"
      foreach d ($c/chr*_?{,?})
        set ctg=$d:t
        set f=$d/$ctg.fa
        maskOutFa -soft $f $f.out $f
        maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
        maskOutFa $f hard $f.masked
      end
    end
    #- Rebuild the nib files, using the soft masking in the fa:
    foreach f (*/chr*.fa)
      faToNib -softMask $f nib/$f:t:r.nib
    end
    # Make one big 2bit file as well, and make a link to it in 
    # /gbdb/canFam1/nib because hgBlat looks there:
    faToTwoBit */chr*.fa canFam1.2bit
    ssh hgwdev
    ln -s /cluster/data/canFam1/canFam1.2bit /gbdb/canFam1/nib/


# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR CANFAM1 (DONE 7/13/04 angie)
    ssh hgwdev
    echo 'insert into blatServers values("canFam1", "blat9", "17778", 1, 0); \
          insert into blatServers values("canFam1", "blat9", "17779", 0, 1);' \
      | hgsql -h genome-testdb hgcentraltest


# MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE 7/6/04 angie)
    ssh hgwdev
    mkdir /gbdb/canFam1/html
    # Write ~/kent/src/hg/makeDb/trackDb/dog/canFam1/description.html 
    # with a description of the assembly and some sample position queries.  
    chmod a+r $HOME/kent/src/hg/makeDb/trackDb/dog/canFam1/description.html
    # Check it in and copy (ideally using "make alpha" in trackDb) to 
    # /gbdb/canFam1/html


# PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE 7/7/04 angie)
    ssh kkr1u00
    # Chrom-level mixed nibs that have been repeat- and trf-masked:
    rm -rf /iscratch/i/canFam1/nib
    mkdir /iscratch/i/canFam1/nib
    cp -p /cluster/data/canFam1/nib/chr*.nib /iscratch/i/canFam1/nib
    # Pseudo-contig fa that have been repeat- and trf-masked:
    rm -rf /iscratch/i/canFam1/maskedContigs
    mkdir /iscratch/i/canFam1/maskedContigs
    foreach d (/cluster/data/canFam1/*/chr*_?{,?})
      cp $d/$d:t.fa /iscratch/i/canFam1/maskedContigs
    end
    cp -p /cluster/data/canFam1/canFam1.2bit /iscratch/i/canFam1/
    du -sh /iscratch/i/canFam1/*
#631M    /iscratch/i/canFam1/canFam1.2bit
#2.5G    /iscratch/i/canFam1/maskedContigs
#1.2G    /iscratch/i/canFam1/nib
    iSync
    # Put nibs and rmsk files in /scratch for big cluster blastz.
    mkdir /cluster/bluearc/scratch/hg/canFam1
    cp -pR /iscratch/i/canFam1/nib /cluster/bluearc/scratch/hg/canFam1
    ssh kksilo
    mkdir /cluster/bluearc/scratch/hg/canFam1/rmsk
    cp -p /cluster/data/canFam1/*/chr*.fa.out \
       /cluster/bluearc/scratch/hg/canFam1/rmsk
    # Ask cluster-admin for an rsync after the next step.  


# MAKE LINEAGE-SPECIFIC REPEATS VS. HUMAN, MOUSE (DONE 7/7/04 angie)
    ssh kksilo
    cd /cluster/bluearc/scratch/hg/canFam1/rmsk
    # Run Arian's DateRepsinRMoutput.pl to add extra columns telling 
    # whether repeats in -query are also expected in -comp species.  
    # Human in extra column 1, Mouse in extra column 2
    foreach outfl ( *.out )
        echo "$outfl"
        /cluster/bluearc/RepeatMasker/DateRepsinRMoutput.pl \
          ${outfl} -query dog -comp human -comp mouse
    end
    # Now extract human (extra column 1), mouse (extra column).
    cd /cluster/bluearc/scratch/hg/canFam1
    mkdir linSpecRep.notInHuman
    mkdir linSpecRep.notInMouse
    foreach f (rmsk/*.out_hum_mus)
        set base = $f:t:r:r
        echo $base.out.spec
        /cluster/bin/scripts/extractLinSpecReps 1 $f > \
                        linSpecRep.notInHuman/$base.out.spec
        /cluster/bin/scripts/extractLinSpecReps 2 $f > \
                        linSpecRep.notInMouse/$base.out.spec
    end
    wc -l rmsk/*.out
#4337285 total
    wc -l linSpecRep.notInHuman/*
#1333452 total
    wc -l linSpecRep.notInMouse/*
#1333452 total
    # The linSpecReps.notIn{Human,Mouse} files are the same.  I asked Arian 
    # if that's a bad thing last week (human vs. {dog,mouse} -> same).  
    # Haven't heard back yet, but hey, we're all mammals here...
    foreach f (linSpecRep.notInHuman/*)
      echo $f:t
      diff $f linSpecRep.notInMouse/$f:t | wc -l | g -vw 0
    end
    # Clean up.
    rm /cluster/bluearc/scratch/hg/canFam1/rmsk/*.out_hum_mus
    # Ask cluster-admin for an rsync.


# MAKE 10.OOC, 11.OOC FILES FOR BLAT (DONE 7/7/04 angie)
    # Use -repMatch=843 (based on size -- for human we use 1024, and 
    # dog size is 2359845093 / 2866216770 = ~82.3% of human judging by 
    # gapless genome size from featureBits)
    ssh kolossus
    mkdir /cluster/data/canFam1/bed/ooc
    cd /cluster/data/canFam1/bed/ooc
    ls -1 /cluster/data/canFam1/nib/chr*.nib > nib.lst
    blat nib.lst /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/canFam1/11.ooc -repMatch=843
#Wrote 26268 overused 11-mers to /cluster/bluearc/canFam1/11.ooc
    blat nib.lst /dev/null /dev/null -tileSize=10 \
      -makeOoc=/cluster/bluearc/canFam1/10.ooc -repMatch=843
#Wrote 143003 overused 10-mers to /cluster/bluearc/canFam1/10.ooc
    ssh kkr1u00
    cp -p /cluster/bluearc/canFam1/*.ooc /iscratch/i/canFam1/
    iSync


# AUTO UPDATE GENBANK MRNA RUN  (DONE 7/8/04 angie)
    # 7/22/04: added xenoRefGene
    # 2/22/06: added refGene
    ssh hgwdev
    # Update genbank config and source in CVS:
    cd ~/kent/src/hg/makeDb/genbank
    cvsup .
    # See if /cluster/data/genbank/etc/genbank.conf has had any un-checked-in
    # edits, check them in if necessary:
    diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf

    # Edit etc/genbank.conf and add these lines:
# canFam1 (dog)
canFam1.genome = /iscratch/i/canFam1/nib/chr*.nib
canFam1.lift = /cluster/data/canFam1/jkStuff/liftAll.lft
canFam1.refseq.mrna.native.load = no
canFam1.refseq.mrna.xeno.load = yes
canFam1.refseq.mrna.xeno.pslReps = -minCover=0.15 -minAli=0.75 -nearTop=0.005
canFam1.genbank.mrna.xeno.load = yes
canFam1.genbank.est.xeno.load = no
canFam1.downloadDir = canFam1

    cvs ci etc/genbank.conf
    # Since dog is a new species for us, edit src/lib/gbGenome.c.  
    # Pick some other browser species, & monkey-see monkey-do.  
    cvs diff src/lib/gbGenome.c
    make
    cvs ci src/lib/gbGenome.c

    # Edit src/align/gbBlat to add /iscratch/i/canFam1/11.ooc
    cvs diff src/align/gbBlat
    make
    cvs ci src/align/gbBlat
    # Install to /cluster/data/genbank:
    make install-server

    ssh eieio
    cd /cluster/data/genbank
    # This is an -initial run, (xeno) refseq only:
    nice bin/gbAlignStep -srcDb=refseq -type=mrna -initial canFam1 &
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad canFam1
    featureBits canFam1 xenoRefGene
#39066569 bases of 2359845093 (1.655%) in intersection
    # Clean up:
    rm -r work/initial.canFam1

    ssh eieio
    cd /cluster/data/genbank
    # This is an -initial run, mRNA only:
    nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial canFam1 &
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad canFam1
    featureBits canFam1 mrna
#1850373 bases of 2359845093 (0.078%) in intersection
    featureBits canFam1 xenoMrna
#62236255 bases of 2359845093 (2.637%) in intersection
    # Clean up:
    rm -r work/initial.canFam1

    ssh eieio
    # -initial for ESTs:
    nice bin/gbAlignStep -srcDb=genbank -type=est -initial canFam1 &
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 canFam1 &
    featureBits canFam1 intronEst
#4090352 bases of 2359845093 (0.173%) in intersection
    featureBits canFam1 est
#10958036 bases of 2359845093 (0.464%) in intersection
    # Clean up:
    rm -r work/initial.canFam1

    # 2/22/06: add native refseq
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    # genbank.conf: canFam1.refseq.mrna.native.load = yes
    make etc-update
    ssh kkstore02
    cd /cluster/data/genbank
    nice bin/gbAlignStep -srcDb=refseq -type=mrna -orgCat=native -initial \
      canFam1 &
    tail -f var/build/logs/2006.02.22-17:54:01.canFam1.initalign.log
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad canFam1 &
    tail -f var/dbload/hgwdev/logs/2006.02.22-18:01:09.dbload.log
    featureBits canFam1 refGene
#1220839 bases of 2359845093 (0.052%) in intersection


# COMPARE COUNTS OF ALIGNED VS NON-ALIGNED MRNAS (DONE 7/8/04 angie)
    ssh hgwdev
    mkdir /cluster/data/canFam1/bed/mrnaCheck
    cd /cluster/data/canFam1/bed/mrnaCheck
    # Get accs of all mRNAs and ESTs in our genbank build.  
    # Note: if/when there's a RefSeq for dog, filter NM_* accessions 
    # out of the mRNA list!
    hgsql -N canFam1 -e 'select acc from mrna where type = "mRNA" and \
                         organism = 2497 or organism = 2654' \
    | sort > accs.mrna
    hgsql -N canFam1 -e 'select acc from mrna where type = "EST" and \
                         organism = 2497 or organism = 2654' \
    | sort > accs.est
    # Get accs of mRNAs and ESTs that were aligned:
    hgsql -N canFam1 -e "select distinct(qName) from  all_mrna" \
    | sort > alignedAccs.mrna
    hgsql -N canFam1 -e "select distinct(qName) from  all_est" \
    | sort > alignedAccs.est
    # Get accs of mRNAs and ESTs that were not aligned:
    diff accs.mrna alignedAccs.mrna | egrep -v '^[0-9]' | sed -e 's/^< //' \
    > notAlignedAccs.mrna
    diff accs.est alignedAccs.est | egrep -v '^[0-9]' | sed -e 's/^< //' \
    > notAlignedAccs.est
    # Look at counts, compute percentages.
    wc -l *
#  37387 accs.est
#   1188 accs.mrna
#  34781 alignedAccs.est
#   1133 alignedAccs.mrna
#   2606 notAlignedAccs.est
#     55 notAlignedAccs.mrna
    expr 37387 - 34781
#2606  (93.0% aligned, 7.0% not aligned) 
    expr 1188 - 1133
#55    (95.4% aligned, 4.6% not aligned)
    # Add descriptions to mRNAs not aligned.  
    cp /dev/null notAlignedDesc.mrna
    foreach acc (`cat notAlignedAccs.mrna`)
      hgsql -N canFam1 \
        -e 'select mrna.acc,description.name from mrna,description \
            where mrna.acc = "'$acc'" and mrna.description = description.id' \
      >> notAlignedDesc.mrna
    end


# MAKE DOWNLOADABLE SEQUENCE FILES (DONE 7/7/04 angie)
    ssh kksilo
    cd /cluster/data/canFam1
    #- Build the .zip files
    cat << '_EOF_' > jkStuff/zipAll.csh
rm -rf zip
mkdir zip
zip -j zip/chromAgp.zip [0-9A-Z]*/chr*.agp
zip -j zip/chromOut.zip */chr*.fa.out
zip -j zip/chromFa.zip */chr*.fa
zip -j zip/chromFaMasked.zip */chr*.fa.masked
cd bed/simpleRepeat
zip ../../zip/chromTrf.zip trfMaskChrom/chr*.bed
cd ../..
cd /cluster/data/genbank
./bin/i386/gbGetSeqs -db=canFam1 -native GenBank mrna \
        /cluster/data/canFam1/zip/mrna.fa
cd /cluster/data/canFam1/zip
zip -j mrna.zip mrna.fa
'_EOF_'
    # << this line makes emacs coloring happy
    csh ./jkStuff/zipAll.csh |& tee zipAll.log
    cd zip
    #- Look at zipAll.log to make sure all file lists look reasonable.  
    #- Check zip file integrity:
    foreach f (*.zip)
      unzip -t $f > $f.test
      tail -1 $f.test
    end
    wc -l *.zip.test

    #- Copy the .zip files to hgwdev:/usr/local/apache/...
    ssh hgwdev
    cd /cluster/data/canFam1/zip
    set gp = /usr/local/apache/htdocs/goldenPath/canFam1
    mkdir -p $gp/bigZips
    cp -p *.zip $gp/bigZips
    mkdir -p $gp/chromosomes
    foreach f ( ../*/chr*.fa )
      zip -j $gp/chromosomes/$f:t.zip $f
    end

    cd $gp/bigZips
    md5sum *.zip > md5sum.txt
    cd $gp/chromosomes
    md5sum *.zip > md5sum.txt
    # Take a look at bigZips/* and chromosomes/*, update their README.txt's
    # Can't make refGene upstream sequence files - no refSeq for dog.
    # Maybe ensGene when we get that.


# SWAP BLASTZ HUMAN-DOG TO DOG-HUMAN (HG17) (DONE 7/9/04 angie)
    ssh kolossus
    mkdir /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08
    ln -s blastz.hg17.swap.2004-07-08 /cluster/data/canFam1/bed/blastz.hg17
    cd /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08
    set aliDir = /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
    cp $aliDir/S1.len S2.len
    cp $aliDir/S2.len S1.len
    mkdir unsorted axtChrom
    cat $aliDir/axtChrom/chr*.axt \
    | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
    | axtSplitByTarget stdin unsorted
    # Sort the shuffled .axt files.
    foreach f (unsorted/*.axt)
      echo sorting $f:t:r
      axtSort $f axtChrom/$f:t
    end
    du -sh $aliDir/axtChrom unsorted axtChrom
    rm -r unsorted


# CHAIN HUMAN BLASTZ (DONE 7/10/04 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 \
  /iscratch/i/canFam1/nib \
  /iscratch/i/gs.18/build35/bothMaskedNibs $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
    # Wow, chr1 dwarfs the others...
#Completed: 41 of 41 jobs
#Average job time:                 293s       4.88m     0.08h    0.00d
#Longest job:                     4419s      73.65m     1.23h    0.05d
#Submission to last job:          4419s      73.65m     1.23h    0.05d

    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=4000 /tmp/score.$f:t:r
      echo ""
    end

    # Lots of chaff with scores in the 3000's.  Many very-high-scoring 
    # chains.  So filter the chain down somewhat...
    mv all.chain all.chain.unfiltered
    chainFilter -minScore=5000 all.chain.unfiltered > all.chain
    rm chain/*
    chainSplit chain all.chain
    gzip all.chain.unfiltered

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        hgLoadChain canFam1 ${c}_chainHg17 $i
    end
    # Wow, very high coverage!
    featureBits -chrom=chr1 canFam1 chainHg17Link
#Out of memory needMem - request size 75 bytes
    featureBits -chrom=chr2 canFam1 chainHg17Link
#55072794 bases of 83844569 (65.684%) in intersection


# NET HUMAN BLASTZ (DONE 7/10/04 angie)
    ssh kksilo
    cd /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08/axtChain
    netClass noClass.net canFam1 hg17 human.net

    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08/axtChain
    rm noClass.net
    # Make a 'syntenic' subset of these with
    netFilter -syn human.net > humanSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08/axtChain
    netFilter -minGap=10 human.net |  hgLoadNet canFam1 netHg17 stdin
    netFilter -minGap=10 humanSyn.net | hgLoadNet canFam1 netSyntenyHg17 stdin
    # Add entries for chainHg17, netHg17, syntenyHg17 to dog/canFam1 trackDb


# GENERATE HG17 MAF FOR MULTIZ FROM NET (DONE 7/19/04 angie)
    ssh kksilo
    cd /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08/axtChain
    netSplit human.net net
    cd /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08
    mkdir axtNet
    foreach f (axtChain/net/*)
      set chr = $f:t:r
      netToAxt $f axtChain/chain/$chr.chain /cluster/data/canFam1/nib \
        /cluster/data/hg17/nib stdout \
      | axtSort stdin axtNet/$chr.axt
    end
    mkdir mafNet
    foreach f (axtNet/chr*.axt)
      set maf = mafNet/$f:t:r.maf
      axtToMaf $f \
            /cluster/data/canFam1/chrom.sizes /cluster/data/hg17/chrom.sizes \
            $maf -tPrefix=canFam1. -qPrefix=hg17.
    end


# MAKE VSHG17 DOWNLOADABLES (DONE 7/27/04 angie)
    ssh kksilo
    cd /cluster/data/canFam1/bed/blastz.hg17/axtChain
    gzip -c all.chain > /cluster/data/canFam1/zip/human.chain.gz
    gzip -c human.net > /cluster/data/canFam1/zip/human.net.gz
    # Added 9/15/04:
    mkdir /cluster/data/canFam1/zip/axtNet
    foreach f (../axtNet/chr*.axt)
      gzip -c $f > /cluster/data/canFam1/zip/axtNet/$f:t.gz
    end
    
    # Make chain-format of raw alignments -- special request by Broad
    cd /cluster/data/canFam1/bed/blastz.hg17
    mkdir blastzECF
    foreach f (axtChrom/chr*.axt.gz)
      set chr = $f:t:r:r
      gunzip -c $f \
      | axtToChain stdin S1.len S2.len stdout \
      | gzip -c - > blastzECF/$chr.ecf.gz
    end

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/canFam1/vsHg17
    cd /usr/local/apache/htdocs/goldenPath/canFam1/vsHg17
    mv /cluster/data/canFam1/zip/human*.gz .
    mv /cluster/data/canFam1/zip/axtNet .
    md5sum *.gz */*.gz > md5sum.txt
    # Copy over & edit README.txt w/pointers to chain, net formats.
    # Not for pushing -- handle separately.
    mv /cluster/data/canFam1/bed/blastz.hg17/blastzECF .
    cd blastzECF
    md5sum *.gz > md5sum.txt


# SWAP BLASTZ MOUSE-DOG TO DOG-MOUSE (MM5) (DONE 7/19/04 angie)
    ssh kolossus
    mkdir /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15
    ln -s blastz.mm5.swap.2004-07-15 /cluster/data/canFam1/bed/blastz.mm5
    cd /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15
    set aliDir = /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
    cp $aliDir/S1.len S2.len
    cp $aliDir/S2.len S1.len
    mkdir unsorted axtChrom
    cat $aliDir/axtChrom/chr*.axt \
    | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
    | axtSplitByTarget stdin unsorted
    # Sort the shuffled .axt files.
    foreach f (unsorted/*.axt)
      echo sorting $f:t:r
      axtSort $f axtChrom/$f:t
    end
    du -sh $aliDir/axtChrom unsorted axtChrom
    rm -r unsorted


# CHAIN MOUSE BLASTZ (DONE 7/19/04 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 \
  /iscratch/i/canFam1/nib \
  /iscratch/i/mus/mm5/softNib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
#Completed: 41 of 41 jobs
#Average job time:                 211s       3.52m     0.06h    0.00d
#Longest job:                      488s       8.13m     0.14h    0.01d
#Submission to last job:           579s       9.65m     0.16h    0.01d

    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=5000 /tmp/score.$f:t:r
      echo ""
    end

    # Lots of chaff with scores in the 3000's.  Many very-high-scoring 
    # chains.  So filter the chain down somewhat...
    mv all.chain all.chain.unfiltered
    chainFilter -minScore=5000 all.chain.unfiltered > all.chain
    rm chain/*
    chainSplit chain all.chain
    gzip all.chain.unfiltered

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        hgLoadChain canFam1 ${c}_chainMm5 $i
    end
    # Similar to human-mouse coverage -- not nearly as high as dog-human.
    featureBits -chrom=chr1 canFam1 chainMm5Link
#40430697 bases of 120715446 (33.493%) in intersection
    featureBits -chrom=chr2 canFam1 chainMm5Link
#31744742 bases of 83844569 (37.861%) in intersection


# NET MOUSE BLASTZ (DONE 7/19/04 angie)
    ssh kolossus
    cd /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15/axtChain
    netClass -noAr noClass.net canFam1 mm5 mouse.net

    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15/axtChain
    rm noClass.net
    # Make a 'syntenic' subset of these with
    netFilter -syn mouse.net > mouseSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15/axtChain
    netFilter -minGap=10 mouse.net |  hgLoadNet canFam1 netMm5 stdin
    netFilter -minGap=10 mouseSyn.net | hgLoadNet canFam1 netSyntenyMm5 stdin
    # Add entries for chainMm5, netMm5, syntenyMm5 to dog/canFam1 trackDb


# GENERATE MM5 MAF FOR MULTIZ FROM NET (DONE 7/19/04 angie)
    ssh kksilo
    cd /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15/axtChain
    netSplit mouse.net net
    cd /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15
    mkdir axtNet
    foreach f (axtChain/net/*)
      set chr = $f:t:r
      netToAxt $f axtChain/chain/$chr.chain /cluster/data/canFam1/nib \
        /cluster/data/mm5/nib stdout \
      | axtSort stdin axtNet/$chr.axt
    end
    mkdir mafNet
    foreach f (axtNet/chr*.axt)
      set maf = mafNet/$f:t:r.maf
      axtToMaf $f \
            /cluster/data/canFam1/chrom.sizes /cluster/data/mm5/chrom.sizes \
            $maf -tPrefix=canFam1. -qPrefix=mm5.
    end


# MAKE VSMM5 DOWNLOADABLES (DONE 7/28/04 angie)
    ssh kksilo
    cd /cluster/data/canFam1/bed/blastz.mm5/axtChain
    gzip -c all.chain > /cluster/data/canFam1/zip/mouse.chain.gz
    gzip -c mouse.net > /cluster/data/canFam1/zip/mouse.net.gz
    # Added 9/15/04:
    mkdir /cluster/data/canFam1/zip/axtNet
    foreach f (../axtNet/chr*.axt)
      gzip -c $f > /cluster/data/canFam1/zip/axtNet/$f:t.gz
    end
    # Make chain-format of raw alignments -- special request by Broad
    cd /cluster/data/canFam1/bed/blastz.mm5
    mkdir blastzECF
    foreach f (axtChrom/chr*.axt)
      set chr = $f:t:r
      axtToChain $f S1.len S2.len stdout \
      | gzip -c - > blastzECF/$chr.ecf.gz
    end

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/canFam1/vsMm5
    cd /usr/local/apache/htdocs/goldenPath/canFam1/vsMm5
    mv /cluster/data/canFam1/zip/mouse*.gz .
    mv /cluster/data/canFam1/zip/axtNet .
    md5sum *.gz > md5sum.txt
    # Copy over & edit README.txt w/pointers to chain, net formats.
    # Not for pushing -- handle separately.
    mv /cluster/data/canFam1/bed/blastz.mm5/blastzECF .
    cd blastzECF
    md5sum *.gz > md5sum.txt


# MULTIZ DOG/HUMAN/MOUSE (DONE 7/19/04 angie)
    # put the MAFs on bluearc
    ssh kksilo
    mkdir -p /cluster/bluearc/multiz.dog/ch
    cp /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08/mafNet/*.maf \
      /cluster/bluearc/multiz.dog/ch
    mkdir -p /cluster/bluearc/multiz.dog/cm
    cp /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15/mafNet/*.maf \
      /cluster/bluearc/multiz.dog/cm

    ssh kki
    mkdir /cluster/data/canFam1/bed/multiz.canFam1hg17mm5
    cd /cluster/data/canFam1/bed/multiz.canFam1hg17mm5
    mkdir chm
    # Wrapper script required because of stdout redirect:
    cat << '_EOF_' > doMultiz
#!/bin/csh
/cluster/bin/penn/multiz $1 $2 - > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doMultiz
    rm -f jobList
    foreach file (/cluster/bluearc/multiz.dog/ch/*.maf) 
      set root=$file:t:r:r
      echo "doMultiz {check in line+ /cluster/bluearc/multiz.dog/cm/${root}.maf} {check in line+ $file} {check out line+ /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/chm/${root}.maf}" >> jobList
    end
    para create jobList
    para try, check, push, check
#Completed: 41 of 41 jobs
#Average job time:                 165s       2.75m     0.05h    0.00d
#Longest job:                      343s       5.72m     0.10h    0.00d
#Submission to last job:           569s       9.48m     0.16h    0.01d

    # clean up bluearc (these are big files!)
    rm -r /cluster/bluearc/multiz.dog

    # setup external files for database reference
    ssh hgwdev
    mkdir -p /gbdb/canFam1/mzHg17Mm5_phast
    ln -s /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/chm/*.maf \
      /gbdb/canFam1/mzHg17Mm5_phast
    # load into database
    hgLoadMaf -warn canFam1 mzHg17Mm5_phast
    # load pairwise mafs for wigMaf track
    cd /gbdb/canFam1
    mkdir -p human_chm mouse_chm
    cd /tmp
    ln -s /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08/mafNet/*.maf \
      /gbdb/canFam1/human_chm
    hgLoadMaf -WARN canFam1 human_chm
    ln -s /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15/mafNet/*.maf \
      /gbdb/canFam1/mouse_chm
    hgLoadMaf -WARN canFam1 mouse_chm

# MAKE MM5 LIFTOVER CHAIN FILE (DONE 1/17/05 Andy)
    # This took over an hour.  Maybe it's that slow no matter what if 
    # the big chain isn't split up first.
    ssh kolossus
    cd /cluster/data/canFam1/bed/blastz.mm5/axtChain
    mkdir over
    netSplit mouse.net net
    mkdir /cluster/data/canFam1/bed/liftOver
    for file in net/*.net; do
       chrom=`basename $file .net`
       netChainSubset net/$chrom.net all.chain.gz over/$chrom.over
       cat over/$chrom.over >> /cluster/data/canFam1/bed/liftOver/canFam1ToMm5.chain
    done
    rm -rf over/
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/canFam1/liftOver
    cd /usr/local/apache/htdocs/goldenPath/canFam1/liftOver
    cp /cluster/data/canFam1/bed/liftOver/canFam1ToMm5.chain .
    gzip canFam1ToMm5.chain
    mkdir -p /gbdb/canFam1/liftOver
    ln -s /cluster/data/canFam1/bed/liftOver/canFam1ToMm5.chain /gbdb/canFam1/liftOver/canFam1ToMm5.over.chain
    hgAddLiftOverChain -multiple canFam1 mm5

# MAKE HG17 LIFTOVER CHAIN FILE (DONE 1/17/05 Andy)
    # Maybe I can do without the net splitting this time... 
    # nope, ran out of memory.  I split up the all.chain file as another step.  It
    # took 1.5 hrs to run.
    ssh kolossus
    cd /cluster/data/canFam1/bed/blastz.hg17/axtChain
    mkdir over
    netSplit human.net net
    mkdir chain
    for file in net/*.net; do
       chrom=`basename $file .net`
       chainFilter -t=$chrom all.chain.gz > chain/$chrom.chain
       netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over
       cat over/$chrom.over >> /cluster/data/canFam1/bed/liftOver/canFam1ToHg17.chain
    done
    rm -rf over/ net/
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/canFam1/liftOver
    cp /cluster/data/canFam1/bed/liftOver/canFam1ToHg17.chain .
    gzip canFam1ToHg17.chain
    ln -s /cluster/data/canFam1/bed/liftOver/canFam1ToHg17.chain /gbdb/canFam1/liftOver/canFam1ToHg17.over.chain
    hgAddLiftOverChain -multiple canFam1 hg17

# PHASTCONS DOG/HUMAN/MOUSE (DONE 7/21/04 angie)
# NOTE: REPLACED, SEE "NEW PHASTCONS" BELOW
    # step 1: fit phylogenetic model, with rate variation (discrete
    # gamma model) first we need to extract the sufficient stats for
    # the phylogenetic model from the MAFs.  In this case, we don't
    # care about the order of the tuples (allows for much more compact
    # representation)
    ssh kksilo
    mkdir /cluster/data/canFam1/bed/phastCons
    cd /cluster/data/canFam1/bed/phastCons
    mkdir unordered
    foreach f (/cluster/data/canFam1/bed/multiz.canFam1hg17mm5/chm/chr*.maf)
      /cluster/bin/phast/msa_view $f --in-format MAF --out-format SS \
            --order canFam1,hg17,mm5 \
            --unordered-ss  > unordered/$f:t:r.ss
    end
    # aggregate for whole genome
    ls unordered/chr*.ss > fnames
    /cluster/bin/phast/msa_view "*fnames" -i SS \
      --aggregate canFam1,hg17,mm5 -o SS --unordered-ss \
      > all.ss
    # estimate parameters; should be very fast in this case
    /cluster/bin/phast/phyloFit --msa all.ss --tree "((1,2),3)" \
      --msa-format SS --subst-mod REV --nrates 40 --out-root rev-dg \
      --output-tree
    # make sure model looks reasonable
    cat rev-dg.mod
#ALPHABET: A C G T 
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 40
#ALPHA: 4.506486
#TRAINING_LNL: -4356689945.768768
#BACKGROUND: 0.297650 0.202241 0.202273 0.297836 
#RATE_MAT:
#  -0.856784    0.157834    0.554732    0.144219 
#   0.232292   -1.211122    0.162600    0.816230 
#   0.816302    0.162574   -1.211329    0.232452 
#   0.144128    0.554249    0.157868   -0.856245 
#TREE: ((1:0.176986,2:0.142776):0.170935,3:0.170935);

    # beware of zero branch lengths (indicates bad topology) or very
    # large branch lengths (much greater than one; indicates
    # convergence problems).  Can try without --nrates or with
    # --subst-mod HKY85 to double check (parameter estimates should be
    # in the same ballpark, REV + dG likelihood should be a bit
    # better).  Also can write a log file to monitor convergence
    # (--log).  Also watch out for alpha much less than one.
    # Sometimes it's helpful to produce a rendering of the tree using
    # draw_tree (run on the *.nh file)

    # Partition the alignments into bite-sized chunks for cluster phastCons 
    # run.  This time we need the ordered version of the SS format.
    set WINSIZE=1000000
    set WINOVERLAP=0
    mkdir SS
    foreach f (/cluster/data/canFam1/bed/multiz.canFam1hg17mm5/chm/*.maf)
      set chr=$f:t:r
      set c=`echo $chr | sed 's/chr//'`
      echo $f $chr $c
      mkdir SS/$c
      /cluster/bin/phast/msa_split $f -i MAF -o SS \
        -O canFam1,hg17,mm5 -M /cluster/data/canFam1/$c/$chr.fa \
        -w ${WINSIZE},${WINOVERLAP} -r SS/$c/$chr -I 1000 -d 1 -B 5000
    end
    # save some space and I/O time:
    gzip SS/*/*.ss

    # step 2: run phastCons.  
    # Since we haven't run with these particular organisms before, 
    # do a whole-genome run without --transitions so that phastCons does a
    # max likelihood estimate of the parameters.  Some jobs may have trouble 
    # converging [not this time] ==> monitor log files of stragglers:
    # Warning signs include likelihoods not decreasing monotonically, 
    # very fast convergence (e.g., only 3 or 4 iterations), 
    # wildly different estimates from different windows.
    ssh kk
    cd /cluster/data/canFam1/bed/phastCons
    cat << '_EOF_' > doMLE
#!/bin/csh -fe

set PHAST=/cluster/bin/phast
set TMP=/scratch/phastCons/chm

set file=$1
set root = $file:t:r:r
set chrom = $root:r

mkdir -p $TMP
cp $file $TMP/$root.ss.gz
zcat $TMP/$root.ss.gz \
| $PHAST/phastCons - rev-dg.mod --nrates 40 \
  --quiet --log ${TMP}/$root.log --no-post-probs --quiet
mkdir -p MLE/$chrom
set lc = `wc -l < $TMP/$root.log`
set pq = `tail -1 $TMP/$root.log | awk '{print $3 " " $4;}'`
echo "$pq $lc" > MLE/$chrom/$root.pqi
rm $TMP/$root.log
rm $TMP/$root.ss.gz
'_EOF_'
    # << this line makes emacs coloring happy
    chmod 775 doMLE
    rm -f jobs.lst
    foreach f (`ls -1S SS/*/*.ss.gz`)
      echo './doMLE {check in exists+ '$f'}' \
      >> jobs.lst
    end
    para create jobs.lst 
    para try ; para push ; etc....
#Completed: 2427 of 2427 jobs
#Average job time:                 190s       3.17m     0.05h    0.00d
#Longest job:                     1256s      20.93m     0.35h    0.01d
#Submission to last job:          1995s      33.25m     0.55h    0.02d

    # back on kksilo:
    cat MLE/*/*.pqi > all.pqi
    # Look for ones that converged too fast (one of Adam's warning signs):
    awk '{print $3;}' all.pqi | sort -n | head -1
#5
    awk '{print $3;}' all.pqi | sort -n \
    | textHistogram stdin -binSize=5 -maxBinCount=30
    # Find median p and q (2427 chunks / 2 = 1213th value):
    set nChunks = `wc -l < all.pqi`
    set n50 = `expr \( $nChunks / 2 \) + 1`
    awk '{print $1;}' all.pqi | sort -n | tail +$n50 | head -1
#0.016533
    awk '{print $2;}' all.pqi | sort -n | tail +$n50 | head -1
#0.000991
    # Eyeball the histogram of p values: should peak close to the median:
    awk '{print $1 * 1000;}' all.pqi | textHistogram stdin -maxBinCount=100

    # Now run phastCons again to calcluate postprobs and conserved 
    # elements using the calculated p and q in --transitions.
    # Note -- with the MLE calc'd p and q, the wiggle looked "oversmoothed" 
    # so now we're rerunning with the same p & q used for human 8-way:
    ssh kk
    cd /cluster/data/canFam1/bed/phastCons
    cat << '_EOF_' > doPostProbsEls
#!/bin/csh -fe

set PHAST=/cluster/bin/phast
set TMP=/scratch/phastCons/chm

set file=$1
set root = $file:t:r:r
set chrom = $root:r

mkdir -p $TMP
mkdir -p PREDICTIONS/$chrom
cp $file $TMP/$root.ss.gz
zcat $TMP/$root.ss.gz \
| $PHAST/phastCons - rev-dg.mod --nrates 40 \
  --quiet --viterbi PREDICTIONS/$chrom/$root.bed --score --seqname $chrom \
  --transitions 0.08,0.008 > $TMP/$root.pp
mkdir -p POSTPROBS/$chrom
gzip -c $TMP/$root.pp > POSTPROBS/$chrom/$root.pp.gz
rm $TMP/$root.pp
rm $TMP/$root.ss.gz
'_EOF_'
    # << this line makes emacs coloring happy
    chmod 775 doPostProbsEls
    rm -f jobs.lst
    foreach f (`ls -1S SS/*/*.ss.gz`)
      echo './doPostProbsEls {check in exists+ '$f'}' \
      >> jobs.lst
    end
    para create jobs.lst 
    para try ; para push ; etc....
    # for p=0.080 run... cluster was really hammering kksilo (usually faster):
#Completed: 2427 of 2427 jobs
#Average job time:                 414s       6.90m     0.11h    0.00d
#Longest job:                     2436s      40.60m     0.68h    0.03d
#Submission to last job:          2724s      45.40m     0.76h    0.03d

    # now create wiggle track
    ssh kksilo
    cd /cluster/data/canFam1/bed/phastCons
    mkdir wib
    foreach dir (POSTPROBS/*)
        echo $dir
        set chr=$dir:t
        zcat `ls -1 $dir/*.pp.gz | sort -t\. -k2,2n` | \
            wigAsciiToBinary -chrom=$chr \
            -wibFile=wib/${chr}_phastCons stdin
    end
    ssh hgwdev
    mkdir -p /gbdb/canFam1/wib/mzHg17Mm5_phast
    cd /cluster/data/canFam1/bed/phastCons
    chmod 775 . wib
    chmod 664 wib/*.wib
    ln -s `pwd`/wib/*.wib /gbdb/canFam1/wib/mzHg17Mm5_phast/
    hgLoadWiggle canFam1 mzHg17Mm5_phast_wig \
      -pathPrefix=/gbdb/canFam1/wib/mzHg17Mm5_phast wib/*.wig

    # Create bed track: tweak scores and names, trim the unused strand column.
    cat PREDICTIONS/*/*.bed \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
    | /cluster/bin/scripts/lodToBedScore \
    > all.bed

    hgLoadBed canFam1 phastConsElements all.bed
    # make top-5000 list and launcher on Adam's home page:
    sort -k5,5nr raw.bed | head -5000 > top5000.bed
    /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \
      /cse/grads/acs/public_html/cf-top5000 \
      "top 5000 conserved elements" canFam1


# MAKE DOWNLOADABLE FILES FOR MULTIZ (DONE 7/21/04 angie)
    # make compressed copies of mafs (don't compress originals -- need them 
    # for the browser)
    ssh kksilo
    cd /cluster/data/canFam1/bed/multiz.canFam1hg17mm5
    mkdir gzMaf
    foreach f (chm/*.maf)
      gzip -c $f > gzMaf/$f:t.gz
    end
    cd /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08
    mkdir gzMaf
    foreach f (mafNet/*.maf)
      gzip -c $f > gzMaf/$f:t.gz
    end
    cd /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15
    mkdir gzMaf
    foreach f (mafNet/*.maf)
      gzip -c $f > gzMaf/$f:t.gz
    end
    # put multi and pairwise mafs out for download
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/canFam1/multizHg17Mm5
    cd /usr/local/apache/htdocs/goldenPath/canFam1/multizHg17Mm5
    mv /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/gzMaf/* .
    rmdir /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/gzMaf
    mv /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08/gzMaf hg17
    mv /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15/gzMaf mm5
    md5sum *.gz */*.gz > md5sum.txt
    # make a README.txt file


# PRODUCING GENSCAN PREDICTIONS (DONE 7/9/04 angie)
    ssh hgwdev
    mkdir /cluster/data/canFam1/bed/genscan
    cd /cluster/data/canFam1/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux
    # Run on small cluster (more mem than big cluster).
    ssh kki
    cd /cluster/data/canFam1/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Generate a list file, genome.list, of all the hard-masked contigs that 
    # *do not* consist of all-N's (which would cause genscan to blow up)
    rm -f genome.list
    touch genome.list
    foreach f ( `ls -1S /cluster/data/canFam1/*/chr*_*/chr*_?{,?}.fa.masked` )
      egrep '[ACGT]' $f > /dev/null
      if ($status == 0) echo $f >> genome.list
    end
    wc -l genome.list
    # Create template file, gsub, for gensub2.  For example (3-line file):
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 genome.list single gsub jobList
    para create jobList
    para try, check, push, check, ...
#Completed: 495 of 497 jobs
#Crashed: 2 jobs
#Average job time:                 803s      13.39m     0.22h    0.01d
#Longest job:                    28087s     468.12m     7.80h    0.33d
#Submission to last job:         34710s     578.50m     9.64h    0.40d
    # If there are crashes, diagnose with "para problems".  
    # If a job crashes due to genscan running out of memory, re-run it 
    # manually with "-window=1200000" instead of "-window=2400000".
    ssh kkr7u00
    cd /cluster/data/canFam1/bed/genscan
    /cluster/bin/x86_64/gsBig /cluster/data/canFam1/5/chr5_17/chr5_17.fa.masked gtf/chr5_17.fa.gtf -trans=pep/chr5_17.fa.pep -subopt=subopt/chr5_17.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
    /cluster/bin/x86_64/gsBig /cluster/data/canFam1/1/chr1_21/chr1_21.fa.masked gtf/chr1_21.fa.gtf -trans=pep/chr1_21.fa.pep -subopt=subopt/chr1_21.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
    ls -1 gtf | wc -l
#    497
    endsInLf gtf/*

    # Convert these to chromosome level files as so:
    ssh kksilo
    cd /cluster/data/canFam1/bed/genscan
    liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
    liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
    cat pep/*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/canFam1/bed/genscan
    # Reloaded without -genePredExt 1/6/05:
    ldHgGene -gtf canFam1 genscan genscan.gtf
    hgPepPred canFam1 generic genscanPep genscan.pep
    hgLoadBed canFam1 genscanSubopt genscanSubopt.bed

# QA NOTE (ASZ 6-11-2007): sudo mytouch canFam1 genscanPep 200501061300.00


# PRODUCING FUGU BLAT ALIGNMENTS (DONE 7/15/04 angie)
    ssh kk
    mkdir /cluster/data/canFam1/bed/blatFr1
    cd /cluster/data/canFam1/bed/blatFr1
    ls -1S /cluster/bluearc/fugu/fr1/trfFa/*.fa > fugu.lst
    ls -1S /iscratch/i/canFam1/maskedContigs/*.fa > dog.lst
    cat << '_EOF_' > gsub
#LOOP
blat -mask=lower -q=dnax -t=dnax {check in exists $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    mkdir psl
    gensub2 dog.lst fugu.lst gsub spec
    para create spec
    para try, check, push, check, ...
#Completed: 3479 of 3479 jobs
#Average job time:                1336s      22.26m     0.37h    0.02d
#Longest job:                     4142s      69.03m     1.15h    0.05d
#Submission to last job:         14450s     240.83m     4.01h    0.17d

    # Sort alignments:
    ssh kksilo
    cd /cluster/data/canFam1/bed/blatFr1
    pslCat -dir psl \
    | liftUp -type=.psl stdout ../../jkStuff/liftAll.lft warn stdin \
    | liftUp -type=.psl -pslQ stdout /cluster/data/fr1/jkStuff/liftAll.lft \
      warn stdin \
    | pslSortAcc nohead chrom /tmp stdin

    # load psl into database:
    ssh hgwdev
    cd /cluster/data/canFam1/bed/blatFr1/chrom
    # Special case: fr1 has only chrUn, so no need to index tName!
    cat *.psl | hgLoadPsl -noTNameIx -fastLoad -table=blatFr1 canFam1 stdin
    featureBits canFam1 blatFr1
#24508362 bases of 2359845093 (1.039%) in intersection
    featureBits hg17 blatFr1
#21977081 bases of 2866216770 (0.767%) in intersection


# LOAD CPGISSLANDS (DONE 7/15/04 angie)
    ssh hgwdev
    mkdir -p /cluster/data/canFam1/bed/cpgIsland
    cd /cluster/data/canFam1/bed/cpgIsland
    # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
    cvs co hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    make
    mv cpglh.exe /cluster/data/canFam1/bed/cpgIsland/
    
    ssh kksilo
    cd /cluster/data/canFam1/bed/cpgIsland
    foreach f (../../*/chr*.fa.masked)
      set fout=$f:t:r:r.cpg
      echo running cpglh on $f to $fout
      ./cpglh.exe $f > $fout
    end
    # Transform cpglh output to bed +
    cat << '_EOF_' > filter.awk
/* Input columns: */
/* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */
/* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */
/* Output columns: */
/* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */
/* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\to0.94 */
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
    # << this line makes emacs coloring happy
    awk -f filter.awk chr*.cpg > cpgIsland.bed

    # load into database:
    ssh hgwdev
    cd /cluster/data/canFam1/bed/cpgIsland
    hgLoadBed canFam1 cpgIslandExt -tab -noBin \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
    wc -l cpgIsland.bed 
#  48070 cpgIsland.bed
    featureBits canFam1 cpgIslandExt
#38799035 bases of 2359845093 (1.644%) in intersection


# BUILD BLAST DATABASES
    cd /cluster/data/canFam1
    mkdir blastDb
     for i in `cat chrom.lst`; do for j in `echo $i/chr$i_*/chr*_*_*.fa`; do ln -s `pwd`/$j blastDb/`basename $j .fa`;
      done; done 
      cd blastDb
      for i in *; do formatdb -p F -i $i; done
# END BLAST


# GC 5 BASE WIGGLE TRACK (DONE 7/7/04 angie)
    # put unmasked nibs in /iscratch for small cluster run.
    ssh kkr1u00
    mkdir /iscratch/i/canFam1/
    cp -pR /cluster/data/canFam1/nib /iscratch/i/canFam1/
    iSync
    ssh kki
    mkdir /cluster/data/canFam1/bed/gc5Base
    cd /cluster/data/canFam1/bed/gc5Base
    mkdir wigData dataLimits
    cat > doGc5Base.csh <<'_EOF_'
#!/bin/csh -fe
set c = $1
set chr = "chr$c"
/cluster/bin/x86_64/hgGcPercent \
  -chr=${chr} -doGaps -file=stdout -win=5 canFam1 ../../nib \
| awk '$4 == "GC" && ($3-$2) >= 5 {printf "%d\t%.1f\n", $2+1, $5/10.0;}' \
| /cluster/bin/x86_64/wigAsciiToBinary \
    -dataSpan=5 -chrom=${chr} -wibFile=wigData/gc5Base_${c} \
    -name=${c} stdin > dataLimits/${chr}
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doGc5Base.csh
    cp /dev/null spec
    foreach c (`cat ../../chrom.lst`)
      echo "./doGc5Base.csh $c" >> spec
    end
    para create spec
    para try, check, push, check, ...
    # ran on kolossus (csh spec >& tee spec.log) because small cluster was 
    # solidly hogged, oh well.
    # data is complete, load track on hgwdev
    ssh hgwdev
    cd /cluster/data/canFam1/bed/gc5Base
    # create symlinks for .wib files, then load track
    mkdir -p /gbdb/canFam1/wib/gc5Base
    ln -s `pwd`/wigData/*.wib /gbdb/canFam1/wib/gc5Base
    hgLoadWiggle -pathPrefix=/gbdb/canFam1/wib/gc5Base \
      canFam1 gc5Base wigData/*.wig


# CREATING QUALITY SCORE TRACK (DONE 7/7/04 angie)
    ssh kolossus
    mkdir /cluster/data/canFam1/bed/qual
    cd /cluster/data/canFam1/bed/qual
    mkdir wigData dataLimits
    foreach agp (../../?{,?}/chr*.agp)
      set chr = $agp:t:r
      set abbrev = `echo $chr | sed -e 's/^chr//;  s/_random/r/;'`
      echo $chr to $abbrev wiggle
      qacToWig ../../broad/fixedQuals.qac -name=$chr stdout \
      | wigAsciiToBinary -dataSpan=1 -chrom=$chr \
          -wibFile=wigData/qual_$abbrev -name=$abbrev stdin \
          > dataLimits/$chr
    end
    # Verify size of .wib file = chrom length
    foreach f (wigData/*.wib)
      set abbrev = $f:t:r
      set chr = `echo $abbrev | sed -e 's/^qual_/chr/;  s/r$/_random/;'`
      set wibLen = `ls -l $f | awk '{print $5;}'`
      set chromLen = `grep -w $chr ../../chrom.sizes | awk '{print $2;}'`
      if ($wibLen != $chromLen) then
        echo "ERROR: $chr size is $chromLen but wib size is $wibLen"
      else
        echo $chr OK.
      endif
    end

    # /gbdb & load:
    ssh hgwdev
    cd /cluster/data/canFam1/bed/qual
    mkdir -p /gbdb/canFam1/wib/qual
    ln -s `pwd`/wigData/*.wib /gbdb/canFam1/wib/qual
    hgLoadWiggle -pathPrefix=/gbdb/canFam1/wib/qual \
      canFam1 quality wigData/*.wig

    # To speed up display for whole chrom views, need to make zoomed
    # data views.  Zoom to 1K points per pixel.
    ssh kolossus
    cd /cluster/data/canFam1/bed/qual
    mkdir -p wigData1K
    mkdir -p dataLimits1K
    foreach c (`cat ../../chrom.lst`)
      if (-f ../../${c}/chr${c}.agp) then
        echo "chr${c} quality 1K zoom"
        qacToWig ../../broad/fixedQuals.qac -name=chr${c} stdout \
        | wigZoom stdin \
        | wigAsciiToBinary -dataSpan=1024 \
                -chrom=chr${c} -wibFile=wigData1K/qc1K_${c} \
                -name=${c} stdin > dataLimits1K/chr${c}
      endif
    end
    ssh hgwdev
    cd /cluster/data/canFam1/bed/qual/wigData1K
    # create symlinks for .wib files
    ln -s `pwd`/*.wib /gbdb/canFam1/wib/qual
    # load in addition to existing data
    hgLoadWiggle -oldTable -pathPrefix=/gbdb/canFam1/wib/qual \
      canFam1 quality *.wig


# ANDY LAW CPGISSLANDS (DONE 7/22/04 angie)
    # See notes in makeGalGal2.doc.
    ssh kksilo
    mkdir /cluster/data/canFam1/bed/cpgIslandGgfAndy
    cd /cluster/data/canFam1/bed/cpgIslandGgfAndy
    # Use masked sequence since this is a mammal...
    cp /dev/null cpgIslandGgfAndyMasked.bed
    foreach f (../../*/chr*.fa.masked)
      set chr = $f:t:r:r
      echo preproc and run on masked $chr
      /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f \
      | /cluster/home/angie/ggf-andy-cpg-island.pl \
      | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
                   $gc = $c + $g;  $pCpG = (100.0 * 2 * $cpg / $n); \
                   $pGc = (100.0 * $gc / $n); \
                   $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
                        "$pCpG\t$pGc\t$oE\n";' \
      >> cpgIslandGgfAndyMasked.bed
    end
    # load into database:
    ssh hgwdev
    cd /cluster/data/canFam1/bed/cpgIslandGgfAndy
    sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
      $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql
    hgLoadBed canFam1 cpgIslandGgfAndyMasked -tab -noBin \
      -sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed
    featureBits canFam1 cpgIslandExt
#38799035 bases of 2359845093 (1.644%) in intersection
    featureBits canFam1 cpgIslandGgfAndyMasked
#100892068 bases of 2359845093 (4.275%) in intersection
    wc -l ../cpgIsland/cpgIsland.bed *bed
#  48070 ../cpgIsland/cpgIsland.bed
# 143011 cpgIslandGgfAndyMasked.bed
    # 6/28/04 -- masking simpleRepeats, and even repeats other than Alu's,
    # might not be the right thing to do (?).  Give it a try with less-masked 
    # sequence.
    ssh kksilo
    cd /cluster/data/canFam1/bed/cpgIslandGgfAndy
    cp /dev/null cpgIslandGgfAndyOnlyRM.bed
    foreach f (../../*/chr*.fa)
      set chr = $f:t:r
      echo preproc, ggf-andy $chr onlyRM
      maskOutFa $f $f.out stdout \
      | /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy stdin \
      | /cluster/home/angie/ggf-andy-cpg-island.pl \
      | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
                   $gc = $c + $g;  $pCpG = (100.0 * 2 * $cpg / $n); \
                   $pGc = (100.0 * $gc / $n); \
                   $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
                        "$pCpG\t$pGc\t$oE\n";' \
      >> cpgIslandGgfAndyOnlyRM.bed
    end
    wc -l cpgIslandGgfAndyOnlyRM.bed
# 142767 cpgIslandGgfAndyOnlyRM.bed
    ssh hgwdev
    cd /cluster/data/canFam1/bed/cpgIslandGgfAndy
    sed -e 's/cpgIslandExt/cpgIslandGgfAndyOnlyRM/g' \
      $HOME/kent/src/hg/lib/cpgIslandExt.sql > /tmp/c.sql
    hgLoadBed canFam1 cpgIslandGgfAndyOnlyRM -tab -noBin -sqlTable=/tmp/c.sql \
      cpgIslandGgfAndyOnlyRM.bed
    featureBits canFam1 cpgIslandGgfAndyOnlyRM.bed
#101295282 bases of 2359845093 (4.292%) in intersection
    # OK, just for fun let's run ggf-andy script with TJ parameters!
    ssh kksilo
    cd /cluster/data/canFam1/bed/cpgIslandGgfAndy
    cp /dev/null cpgIslandGgfAndyTJ.bed
    foreach f (../../*/chr*.fa)
      set chr = $f:t:r
      echo running modified w/TJ on $chr
      /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f \
      | /cluster/home/angie/ggf-andy-cpg-island.pl \
         -min-length 500 -percent-gc 55 -obs-over-exp 0.65 \
      | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
                   $gc = $c + $g;  $pCpG = (100.0 * 2 * $cpg / $n); \
                   $pGc = (100.0 * $gc / $n); \
                   $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
                        "$pCpG\t$pGc\t$oE\n";' \
      >> cpgIslandGgfAndyTJ.bed
    end
    wc -l cpgIslandGgfAndyTJ.bed
#  55055 cpgIslandGgfAndyTJ.bed
    # load into database:
    ssh hgwdev
    cd /cluster/data/canFam1/bed/cpgIslandGgfAndy
    # this one is a cpgIslandExt but with a different table name:
    sed -e 's/cpgIslandExt/cpgIslandGgfAndyTJ/g' \
      $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyTJ.sql
    hgLoadBed canFam1 cpgIslandGgfAndyTJ -tab -noBin \
      -sqlTable=cpgIslandGgfAndyTJ.sql cpgIslandGgfAndyTJ.bed
    featureBits canFam1 cpgIslandGgfAndyTJ
#80070786 bases of 2359845093 (3.393%) in intersection
    # Try featureBits -enrichment:
    featureBits -enrichment canFam1 xenoRefGene:upstream:5000 cpgIslandExt
#xenoRefGene:upstream:5000 3.047%, cpgIslandExt 1.644%, both 0.249%, cover 8.16%, enrich 4.96x
    featureBits -enrichment canFam1 xenoRefGene:upstream:5000 cpgIslandGgfAndyMasked
#xenoRefGene:upstream:5000 3.047%, cpgIslandGgfAndyMasked 4.275%, both 0.508%, cover 16.66%, enrich 3.90x
    featureBits -enrichment canFam1 xenoRefGene:upstream:5000 cpgIslandGgfAndyTJ
#xenoRefGene:upstream:5000 3.047%, cpgIslandGgfAndyTJ 3.393%, both 0.570%, cover 18.72%, enrich 5.52x

    # 8/25/05 -- Laura Elnitski requested unmasked.
    cd /cluster/data/canFam1/bed/cpgIslandGgfAndy
    cp /dev/null cpgIslandGgfAndy.bed
    foreach f (../../*/chr*.fa)
      set chr = $f:t:r
      echo preproc and run on unmasked $chr
      preProcGgfAndy $f \
      | /cluster/home/angie/ggf-andy-cpg-island.pl \
      | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
                   $gc = $c + $g;  $pCpG = (100.0 * 2 * $cpg / $n); \
                   $pGc = (100.0 * $gc / $n); \
                   $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
                        "$pCpG\t$pGc\t$oE\n";' \
      >> cpgIslandGgfAndy.bed
    end
    # load into database:
    ssh hgwdev
    cd /cluster/data/canFam1/bed/cpgIslandGgfAndy
    sed -e 's/cpgIslandExt/cpgIslandGgfAndy/g' \
      $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndy.sql
    hgLoadBed canFam1 cpgIslandGgfAndy -tab -noBin \
      -sqlTable=cpgIslandGgfAndy.sql cpgIslandGgfAndy.bed


# NEW PHASTCONS DOG/HUMAN/MOUSE (DONE 9/29/04 angie)
    ssh kksilo
    # copy chrom fa to bluearc, break up the genome-wide MAFs into pieces
    mkdir -p /cluster/bluearc/canFam1/chrom
    cp -p /cluster/data/canFam1/?{,?}/chr*.fa /cluster/bluearc/canFam1/chrom/
    ssh kki
    mkdir /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons
    mkdir /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons/run.split
    cd /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons/run.split
    set WINDOWS = /cluster/bluearc/canFam1/phastCons/WINDOWS
    rm -fr $WINDOWS
    mkdir -p $WINDOWS
    cat << 'EOF' > doSplit.sh
#!/bin/csh -ef

set PHAST=/cluster/bin/phast
set FA_SRC=/cluster/bluearc/canFam1/chrom
set WINDOWS=/cluster/bluearc/canFam1/phastCons/WINDOWS

set maf=$1
set c = $maf:t:r
set tmpDir = /scratch/msa_split/$c
rm -rf $tmpDir
mkdir -p $tmpDir
${PHAST}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O canFam1,hg17,mm5 \
   -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
cd $tmpDir
foreach file ($c.*.ss)
  gzip -c $file > ${WINDOWS}/$file.gz
end
rm -f $tmpDir/$c.*.ss
rmdir $tmpDir
'EOF'
# << for emacs
    chmod a+x doSplit.sh
    rm -f jobList
    foreach file (../../chm/*.maf)
      if (-s $file) then
        echo "doSplit.sh {check in line+ $file}" >> jobList
      endif
    end
    para create jobList
    para try,  check, push, check...
#Completed: 41 of 41 jobs
#Average job time:                 201s       3.36m     0.06h    0.00d
#Longest job:                      392s       6.53m     0.11h    0.00d
#Submission to last job:           720s      12.00m     0.20h    0.01d
    cd ..

    # use the model previously estimated (see above) as a starting model
    /cluster/bin/phast/tree_doctor \
      --rename '1 -> canFam1 ; 2 -> hg17 ; 3 -> mm5' \
      ../../phastCons/rev-dg.mod \
      > starting-tree.mod
#TREE: ((canFam1:0.176986,hg17:0.142776):0.170935,mm5:0.170935);

    # Get genome-wide average GC content (for all species together,
    # not just the reference genome).  If you have a globally
    # estimated tree model, as above, you can get this from the
    # BACKGROUND line in the .mod file.  E.g.,
# ALPHABET: A C G T
# ...
# BACKGROUND: 0.276938 0.223190 0.223142 0.276730
    # add up the C and G:
    awk '$1 == "BACKGROUND:" {printf "%0.3f\n", $3 + $4;}' starting-tree.mod
#0.405
    # Great, use 0.405 as the --gc parameter in phastCons below:.

    # Now set up cluster job to estimate model parameters.  
    # Parameters will be estimated separately for each alignment fragment 
    # then will be combined across fragments.
    # Use --gc from above, --expected-lengths 12 a typical for mammals,
    # and --target-coverage computed as follows:
    # 1. Jim would like phastConsElements coverage to be 5% for mammals.
    # 2. ~66% of canFam1 is covered by chainHg17Link, so use .05 / .66 = .08 
    #    as an initial --target-coverage.
    # 3. If actual coverage is different from our target, come back to this 
    #    step, adjust --target-coverage and rerun up through phastConsElements.
    # OK, .08 landed us a little low, try .10:
#90197498 bases of 2359845093 (3.822%) in intersection
    # .10 still a little low, try .15:
#95675167 bases of 2359845093 (4.054%) in intersection
    # OK, .15 seems close enough:
#110676039 bases of 2359845093 (4.690%) in intersection

    mkdir run.estimate
    cd run.estimate
    cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.405 --nrates 1,1 \
    --no-post-probs --ignore-missing --expected-lengths 12 \
    --target-coverage 0.15 --quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
    chmod a+x doEstimate.sh
    rm -fr LOG TREES
    mkdir -p LOG TREES
    rm -f jobList
    foreach f (/cluster/bluearc/canFam1/phastCons/WINDOWS/*.ss.gz)
      set root = $f:t:r:r
      echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
    end
    # run cluster job
    ssh kk
    cd /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons/run.estimate
    para create jobList
    para try, check, push, check, ...
#Completed: 2427 of 2427 jobs
#Average job time:                  79s       1.31m     0.02h    0.00d
#Longest job:                      146s       2.43m     0.04h    0.00d
#Submission to last job:          1159s      19.32m     0.32h    0.01d

    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models
    ssh kksilo
    cd /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons/run.estimate
    ls -1 TREES/*.cons.mod > cons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
      --output-average ave.cons.mod > cons_summary.txt
    ls -1 TREES/*.noncons.mod > noncons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
      --output-average ave.noncons.mod > noncons_summary.txt
    grep TREE ave*.mod
#ave.cons.mod:TREE: ((canFam1:0.052234,hg17:0.043243):0.052171,mm5:0.052171);
#ave.noncons.mod:TREE: ((canFam1:0.196400,hg17:0.162279):0.195831,mm5:0.195831);
    cat cons_summary.txt
    # look over the files cons_summary.txt and noncons_summary.txt.
    # The means and medians should be roughly equal and the stdevs
    # should be reasonably small compared to the means, particularly
    # for rate matrix parameters (at bottom) and for branches to the
    # leaves of the tree.  The stdevs may be fairly high for branches
    # near the root of the tree; that's okay.  Some min values may be
    # 0 for some parameters.  That's okay, but watch out for very large
    # values in the max column, which might skew the mean.  If you see
    # any signs of bad outliers, you may have to track down the
    # responsible .mod files and throw them out.  I've never had to do
    # this; the estimates generally seem pretty well behaved.

    # NOTE: Actually, a random sample of several hundred to a thousand
    # alignment fragments (say, a number equal to the number of
    # available cluster nodes) should be more than adequate for
    # parameter estimation.  If pressed for time, use this strategy.

    # Now we are ready to set up the cluster job for computing the
    # conservation scores and predicted elements.  It's all downhill
    # from here.
    ssh kk
    mkdir /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons/run.phast
    cd /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons/run.phast
    cat << 'EOF' > doPhastCons.sh
#!/bin/csh -ef
set pref = $1:t:r:r
set chr = `echo $pref | awk -F\. '{print $1}'`
set tmpfile = /scratch/phastCons.canFam1.$pref.$$
zcat $1 \
| /cluster/bin/phast/phastCons - \
    ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
    --expected-lengths 12 --target-coverage 0.15 --quiet --seqname $chr \
    --idpref $pref \
    --viterbi /cluster/bluearc/canFam1/phastCons/ELEMENTS/$pref.bed --score \
    --require-informative 0 \
  > $tmpfile
gzip -c $tmpfile > /cluster/bluearc/canFam1/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
'EOF'
# << for emacs
    chmod a+x doPhastCons.sh
    rm -fr /cluster/bluearc/canFam1/phastCons/{POSTPROBS,ELEMENTS}
    mkdir -p /cluster/bluearc/canFam1/phastCons/{POSTPROBS,ELEMENTS}
    rm -f jobList
    foreach f (/cluster/bluearc/canFam1/phastCons/WINDOWS/*.ss.gz)
      echo doPhastCons.sh $f >> jobList
    end
    para create jobList
    para try, check, push, check, ...
#Completed: 2427 of 2427 jobs
#Average job time:                  67s       1.12m     0.02h    0.00d
#Longest job:                      290s       4.83m     0.08h    0.00d
#Submission to last job:           920s      15.33m     0.26h    0.01d

    # back on kksilo:
    # combine predictions and transform scores to be in 0-1000 interval
    # do in a way that avoids limits on numbers of args
    cd /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons
    cp /dev/null all.bed
    foreach chr (`awk '{print $1;}' /cluster/data/canFam1/chrom.sizes`)
      cat /cluster/bluearc/canFam1/phastCons/ELEMENTS/$chr.*.bed \
      | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
      | /cluster/bin/scripts/lodToBedScore >> all.bed
    end

    ssh hgwdev
    cd /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons
    featureBits canFam1 all.bed
#110676039 bases of 2359845093 (4.690%) in intersection

    # OK, close enough.
    hgLoadBed canFam1 phastConsElements all.bed

    # Create wiggle on the small cluster
    ssh kki
    mkdir /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons/run.wib
    cd /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons/run.wib
    rm -rf /cluster/bluearc/canFam1/phastCons/wib
    mkdir -p /cluster/bluearc/canFam1/phastCons/wib
    cat << 'EOF' > doWigAsciiToBinary
#!/bin/csh -ef
set chr = $1
zcat `ls -1 /cluster/bluearc/canFam1/phastCons/POSTPROBS/$chr.*.pp.gz \
      | sort -t\. -k2,2n` \
| wigAsciiToBinary -chrom=$chr \
    -wibFile=/cluster/bluearc/canFam1/phastCons/wib/${chr}_phastCons stdin 
'EOF'
# << for emacs
    chmod a+x doWigAsciiToBinary
    rm -f jobList
    foreach chr (`ls -1 /cluster/bluearc/canFam1/phastCons/POSTPROBS \
                  | awk -F\. '{print $1}' | sort -u`)
      echo doWigAsciiToBinary $chr >> jobList
    end
    para create jobList
    para try, check, push, check, ...
#Completed: 41 of 41 jobs
#Average job time:                 502s       8.37m     0.14h    0.01d
#Longest job:                     1194s      19.90m     0.33h    0.01d
#Submission to last job:          1459s      24.32m     0.41h    0.02d

    # back on kksilo, copy wibs, wigs and POSTPROBS (people sometimes want 
    # the raw scores) from bluearc
    cd /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons
    rm -rf wib POSTPROBS
    rsync -av /cluster/bluearc/canFam1/phastCons/wib .
    rsync -av /cluster/bluearc/canFam1/phastCons/POSTPROBS .

    # load wiggle component of Conservation track
    ssh hgwdev
    rm -rf /gbdb/canFam1/wib/mzHg17Mm5_phast2
    mkdir -p /gbdb/canFam1/wib/mzHg17Mm5_phast2
    cd /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons
    chmod 775 . wib
    chmod 664 wib/*.wib
    ln -s `pwd`/wib/*.wib /gbdb/canFam1/wib/mzHg17Mm5_phast2/
    hgLoadWiggle canFam1 mzHg17Mm5_phast2_wig \
      -pathPrefix=/gbdb/canFam1/wib/mzHg17Mm5_phast2 wib/*.wig

    # make top-5000 list and launcher on Adam's home page:
    sed -e 's/lod=//' all.bed | sort -k4,4nr | head -5000 \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%d\n", $1, $2, $3, $4, $4}' \
    > top5000.bed
    /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \
      /cse/grads/acs/public_html/cf-top5000 \
      "top 5000 conserved elements" canFam1

    # and clean up bluearc.
    rm -r /cluster/bluearc/canFam1/phastCons
    rm -r /cluster/bluearc/canFam1/chrom


# NEW REPEATMASKER (DONE 10/5/04 angie)
    # Use the new version of RepeatMasker, load up as rmskNew, 
    # compare results w/orig.

    #- contigs have already been split into 500kb chunks, at gaps if possible.

    #- Make the run directory and job list:
    ssh kksilo
    cd /cluster/data/canFam1
    cat << '_EOF_' > jkStuff/NewRMDog
#!/bin/csh -fe

cd $1
pushd .
/bin/mkdir -p /tmp/canFam1/$2
/bin/cp $2 /tmp/canFam1/$2/
cd /tmp/canFam1/$2
/cluster/bluearc/RepeatMasker.040909/RepeatMasker -s -spec dog $2
popd
/bin/cp /tmp/canFam1/$2/$2.out ./$2.new.out
/bin/rm -fr /tmp/canFam1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/canFam1/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/canFam1
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/NewRMDog
    mkdir NewRMRun
    cp /dev/null NewRMRun/RMJobs
    foreach c (`cat chrom.lst`)
      foreach d ($c/chr${c}_?{,?})
          set ctg = $d:t
          foreach f ( $d/${ctg}_?{,?}.fa )
            set f = $f:t
            echo /cluster/data/canFam1/jkStuff/NewRMDog \
                 /cluster/data/canFam1/$d $f \
               '{'check out line+ /cluster/data/canFam1/$d/$f.new.out'}' \
              >> NewRMRun/RMJobs
          end
      end
    end

    #- Do the run
    ssh kk
    cd /cluster/data/canFam1/NewRMRun
    para create RMJobs
    para try, para check, para check, para push, para check,...
#Completed: 6373 of 6373 jobs
#CPU time in finished jobs:   26081453s  434690.89m  7244.85h  301.87d  0.827 y
#IO & Wait Time:                281283s    4688.05m    78.13h    3.26d  0.009 y
#Average job time:                4137s      68.94m     1.15h    0.05d
#Longest job:                     6392s     106.53m     1.78h    0.07d
#Submission to last job:         50241s     837.35m    13.96h    0.58d
    # Some jobs crashed, but succeeded after repushing --> longer total time.

    #- Lift up the 500KB chunk .new.out's to 5MB ("pseudo-contig") level
    ssh kksilo
    cd /cluster/data/canFam1
    foreach d (*/chr*_?{,?})
      set contig = $d:t
      echo $contig
      liftUp $d/$contig.fa.new.out $d/$contig.lft warn $d/${contig}_*.fa.new.out \
        > /dev/null
    end

    #- Lift pseudo-contigs to chromosome level
    foreach c (`cat chrom.lst`)
      echo lifting $c
      cd $c
      liftUp chr$c.fa.new.out lift/ordered.lft warn `sed -e 's/\.out$/.new.out/' lift/oOut.lst` > /dev/null
      cd ..
    end

    #- Load the .out files into the database with:
    ssh hgwdev
    cd /cluster/data/canFam1
    # Doh, hgLoadOut can't deal with too many .suffixes, abbreviate:
    foreach f (*/chr*.fa.new.out)
      mv $f $f:r:r:r.new.out
    end
    hgLoadOut -table=rmskNew canFam1 */chr*.new.out
    # chr22 looks like the winner here, send example to Robert Hubley:
#Strange perc. field -24.8 line 5580 of 10/chr10.new.out
#Strange perc. field -1.8 line 5580 of 10/chr10.new.out
#Strange perc. field -0.2 line 27436 of 13/chr13.new.out
#Strange perc. field -0.1 line 27436 of 13/chr13.new.out
#Strange perc. field -0.9 line 7302 of 2/chr2.new.out
#Strange perc. field -5.3 line 15894 of 2/chr2.new.out
#Strange perc. field -4.1 line 15894 of 2/chr2.new.out
#Strange perc. field -677.0 line 78121 of 22/chr22.new.out
#Strange perc. field -174.9 line 78121 of 22/chr22.new.out
#Strange perc. field -11.7 line 85036 of 23/chr23.new.out
#Strange perc. field -0.8 line 85036 of 23/chr23.new.out
#Strange perc. field -2.3 line 17185 of 27/chr27.new.out
#Strange perc. field -5.0 line 34029 of 27/chr27.new.out
#Strange perc. field -1.8 line 26743 of 29/chr29.new.out
#Strange perc. field -22.6 line 90657 of 4/chr4.new.out
#Strange perc. field -1.1 line 50705 of 5/chr5.new.out
    nice featureBits canFam1 rmskNew
#901188175 bases of 2359845093 (38.188%) in intersection
    # Yep, pretty close to original rmsk coverage:
#896773874 bases of 2359845093 (38.001%) in intersection


# TWINSCAN (DONE 11/9/04 angie)
    ssh kksilo
    mkdir /cluster/data/canFam1/bed/twinscan
    cd /cluster/data/canFam1/bed/twinscan
    wget http://genes.cs.wustl.edu/predictions/dog/canFam1/canFam1_TS13_pseudomasked.tar.gz
    tar xvzf canFam1_TS13_pseudomasked.tar.gz
    # Add '.a' to end of protein fasta id's, to match gtf transcript_id's:
    cp /dev/null twinscanPep.fa
    perl -wpe 's/^(>\S+).*/$1.a/' chr_ptx/*.ptx > twinscanPep.fa
    # load.
    ssh hgwdev
    cd /cluster/data/canFam1/bed/twinscan
    ldHgGene -gtf -genePredExt canFam1 twinscan chr_gtf/chr*.gtf
    hgPepPred canFam1 generic twinscanPep twinscanPep.fa
    featureBits -enrichment canFam1 xenoRefGene twinscan
#xenoRefGene 1.676%, twinscan 1.384%, both 1.030%, cover 61.42%, enrich 44.39x

# QA NOTE (ASZ 6-11-2007): sudo mytouch canFam1 twinscanPep 200411091200.00

# Pick up photo from NHGRI (DONE - 2004-12-22 - Hiram)
    ssh hgwdev
    cd /tmp
    wget \
	'http://www.genome.gov/Pages/News/Photos/Science/dog_image.jpg'
	-O dog_image.jpg
    #	view that image in 'display' to determine crop edges, then:
    convert -crop 952x1256+72+280 -quality 80 -sharpen 0 \
	-normalize dog_image.jpg ccf.jpg
    convert -geometry 300x200 -quality 80 ccf.jpg Canis_familiaris.jpg

    cp -p Canis_familiaris.jpg /usr/local/apache/htdocs/images
    rm -f ccf.jpg dog_image.jpg
    #	add links to this image in the description.html page, request push

# MAKE Human Proteins track (DONE 2005-1-15 braney)
    ssh kksilo
    mkdir -p /cluster/data/canFam1/blastDb
    cd /cluster/data/canFam1/blastDb
    #cut -f 1 ../chrom.sizes | sed "s/chr//" | sed "/NA/d" | sed "/Un/d" > chrom.list
    for i in `cat ../chrom.lst`; do ls -1 ../$i/*/*.fa . ; done | sed -n "/.*_.*_.*_.*/p" > list
    for i in `cat list`
    do
	ln -s $i .
	formatdb -i `basename $i` -p F
    done
    rm *.log *.fa list
    cd ..
    for i in `cat chrom.lst`; do cat  $i/chr*/*.lft  ; done > jkStuff/subChr.lft

    ssh kkr1u00
    mkdir -p /iscratch/i/canFam1/blastDb
    cd /cluster/data/canFam1/blastDb
    for i in nhr nin nsq; do cp *.$i /iscratch/i/canFam1/blastDb     ; echo $i; done

    cd
    (iSync) > sync.out

    mkdir -p /cluster/data/canFam1/bed/tblastn.hg17KG
    cd /cluster/data/canFam1/bed/tblastn.hg17KG
    echo  /iscratch/i/canFam1/blastDb/*.nsq  | xargs ls -S | sed "s/\.nsq//"  > query.lst  
    # back to kksilo
    exit

    # we want around 250000 jobs
    cd /cluster/data/canFam1/bed/tblastn.hg17KG
    calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(500000/`wc query.lst | awk "{print \\\$1}"`\)
    # 42156/(500000/6373) = 537.320376
    mkdir -p /cluster/bluearc/canFam1/bed/tblastn.hg17KG/kgfa
    split -l 537 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/canFam1/bed/tblastn.hg17KG/kgfa/kg
    ln -s /cluster/bluearc/canFam1/bed/tblastn.hg17KG/kgfa kgfa
    cd kgfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    mkdir -p /cluster/bluearc/canFam1/bed/tblastn.hg17KG/blastOut
    ln -s /cluster/bluearc/canFam1/bed/tblastn.hg17KG/blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
	liftUp -nosort -type=".psl" -nohead $f.3 ../../jkStuff/subChr.lft carry $f.2       
        liftUp -nosort -type=".psl" -nohead $f.4 ../../jkStuff/liftAll.lft carry $f.3       
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.4       
        if pslCheck -prot $3.tmp                                                                          
        then                                                                                              
            mv $3.tmp $3                                                                                  
            rm -f $f.1 $f.2 $f.3 $f.4                                                                              
        fi
        exit 0                                                                                            
    fi                                                                                                    
fi                                                                                                        
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4      
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/canFam1/bed/tblastn.hg17KG
    para create blastSpec
    para push
# Completed: 503467 of 503467 jobs
# CPU time in finished jobs:  101853630s 1697560.49m 28292.67h 1178.86d  3.230 y
# IO & Wait Time:              21902864s  365047.74m  6084.13h  253.51d  0.695 y
# Average job time:                 246s       4.10m     0.07h    0.00d
# Longest job:                     1208s      20.13m     0.34h    0.01d
# Submission to last job:        204867s    3414.45m    56.91h    2.37d

    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainOne
(cd $1; cat q."$2"* | simpleChain -prot -outPsl -maxGap=200000 stdin ../c.`basename $1`.$2.psl)
'_EOF_'
    chmod +x chainOne

    for j in blastOut/kg??; do for i in `cat ../../chrom.lst`; do echo chainOne $j chr"$i"; done ; done > chainSpec

    ssh kki
    cd /cluster/data/canFam1/bed/tblastn.hg17KG
    para create chainSpec
    para push
# Completed: 3239 of 3239 jobs
# CPU time in finished jobs:    2859185s   47653.08m   794.22h   33.09d  0.091 y
# IO & Wait Time:                212945s    3549.09m    59.15h    2.46d  0.007 y
# Average job time:                 948s      15.81m     0.26h    0.01d
# Longest job:                    49695s     828.25m    13.80h    0.58d
# Submission to last job:        227112s    3785.20m    63.09h    2.63d

    exit
    # back to kksilo
    cd /cluster/data/canFam1/bed/tblastn.hg17KG/blastOut
    for i in kg??
    do 
	cat c.$i.*.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 17,17n -k 17,17n  | uniq  > /cluster/data/canFam1/bed/tblastn.hg17KG/blastHg17KG.psl
    cd ..
    ssh hgwdev
    cd /cluster/data/canFam1/bed/tblastn.hg17KG
    hgLoadPsl canFam1 blastHg17KG.psl

    # back to kksilo
    rm -rf blastOut
# End tblastn

# BLASTZ BOREOEUTHERIAN (BOREUT1) (DONE 1/29/05 braney)
    ssh kk
    mkdir /cluster/data/borEut1/bed/zb.canFam1 
    ln -s /cluster/data/borEut1/bed/zb.canFam1 /cluster/data/canFam1/bed/blastz.borEut1
    cd /cluster/data/canFam1/bed/blastz.borEut1
    # Use default (Human-Mouse) settings for starters.
    cat << '_EOF_' > DEF
# dog vs. borEut
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Default
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Dog
SEQ1_DIR=/iscratch/i/canFam1/nib
SEQ1_RMSK=
SEQ1_FLAG=
# SEQ1_SMSK=/scratch/hg/gs.18/build35/linSpecRep.notInDog
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: BorEut
SEQ2_DIR=/iscratch/i/borEut1/nib
SEQ2_RMSK=
SEQ2_FLAG=
#SEQ2_SMSK=/scratch/hg/canFam1/linSpecRep.notInHuman
SEQ2_SMSK=
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/canFam1/bed/blastz.borEut1

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy

    /cluster/data/hg17/jkStuff/BlastZ_run0.sh
    cd run.0
    para push
# Completed: 2200 of 2200 jobs
# CPU time in finished jobs:     720384s   12006.40m   200.11h    8.34d  0.023 y
# IO & Wait Time:                 21690s     361.50m     6.03h    0.25d  0.001 y
# Average job time:                 337s       5.62m     0.09h    0.00d
# Longest job:                     2099s      34.98m     0.58h    0.02d
# Submission to last job:          2267s      37.78m     0.63h    0.03d

    ssh kki
    cd /cluster/data/canFam1/bed/blastz.borEut1
    /cluster/data/hg17/jkStuff/BlastZ_run1.sh

    cd run.1
    para push
# Completed: 275 of 275 jobs
# CPU time in finished jobs:        243s       4.06m     0.07h    0.00d  0.000 y
# IO & Wait Time:                   774s      12.89m     0.21h    0.01d  0.000 y
# Average job time:                   4s       0.06m     0.00h    0.00d
# Longest job:                        8s       0.13m     0.00h    0.00d
# Submission to last job:            78s       1.30m     0.02h    0.00d

    ssh kk
    cd /cluster/data/canFam1/bed/blastz.borEut1
    /cluster/data/hg17/jkStuff/BlastZ_run2.sh
    cd run.2
    para push

# Completed: 41 of 41 jobs
# CPU time in finished jobs:        266s       4.43m     0.07h    0.00d  0.000 y
# IO & Wait Time:                   822s      13.70m     0.23h    0.01d  0.000 y
# Average job time:                  27s       0.44m     0.01h    0.00d
# Longest job:                       54s       0.90m     0.01h    0.00d
# Submission to last job:            58s       0.97m     0.02h    0.00d

# END BLASTZ BOREOEUTHERIAN

# NEW NEW REPEATMASKER (DONE 1/27/05 angie)
    # Use the new version of RepeatMasker w/new dog lib, load up as rmskNew, 
    # compare results w/orig.

    #- contigs have already been split into 500kb chunks, at gaps if possible.

    #- Make the run directory and job list:
    ssh kksilo
    cd /cluster/data/canFam1
    cat << '_EOF_' > jkStuff/NewRMDog
#!/bin/csh -fe

cd $1
pushd .
/bin/mkdir -p /tmp/canFam1/$2
/bin/cp $2 /tmp/canFam1/$2/
cd /tmp/canFam1/$2
/cluster/bluearc/RepeatMasker050112/RepeatMasker -s -spec dog $2
popd
/bin/cp /tmp/canFam1/$2/$2.out ./$2.new.out
/bin/rm -fr /tmp/canFam1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/canFam1/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/canFam1
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/NewRMDog
    mkdir NewRMRun
    cp /dev/null NewRMRun/RMJobs
    foreach c (`cat chrom.lst`)
      foreach d ($c/chr${c}_?{,?})
          set ctg = $d:t
          foreach f ( $d/${ctg}_?{,?}.fa )
            set f = $f:t
            echo /cluster/data/canFam1/jkStuff/NewRMDog \
                 /cluster/data/canFam1/$d $f \
               '{'check out line+ /cluster/data/canFam1/$d/$f.new.out'}' \
              >> NewRMRun/RMJobs
          end
      end
    end

    #- Do the run
    ssh kk
    cd /cluster/data/canFam1/NewRMRun
    para create RMJobs
    para try, para check, para check, para push, para check,...
#Completed: 4676 of 4676 jobs
#CPU time in finished jobs:   23753156s  395885.93m  6598.10h  274.92d  0.753 y
#IO & Wait Time:                122587s    2043.12m    34.05h    1.42d  0.004 y
#Average job time:                5106s      85.10m     1.42h    0.06d
#Longest job:                    11286s     188.10m     3.13h    0.13d
#Submission to last job:         73093s    1218.22m    20.30h    0.85d

    #- Lift up the 500KB chunk .new.out's to 5MB ("pseudo-contig") level
    ssh kksilo
    cd /cluster/data/canFam1
    foreach d (*/chr*_?{,?})
      set contig = $d:t
      echo $contig
      liftUp $d/$contig.fa.new.out $d/$contig.lft warn $d/${contig}_*.fa.new.out \
        > /dev/null
    end

    #- Lift pseudo-contigs to chromosome level
    foreach c (`cat chrom.lst`)
      echo lifting $c
      cd $c
      liftUp chr$c.fa.new.out lift/ordered.lft warn `sed -e 's/\.out$/.new.out/' lift/oOut.lst` > /dev/null
      cd ..
    end

    #- Load the .out files into the database with:
    ssh hgwdev
    cd /cluster/data/canFam1
    # Doh, hgLoadOut can't deal with too many .suffixes, abbreviate:
    foreach f (*/chr*.fa.new.out)
      mv $f $f:r:r:r.new.out
    end
    hgLoadOut -table=rmskNew canFam1 */chr*.new.out
     # Lots of these warnings:
#Strange perc. field -29.7 line 40263 of 7/chr7.new.out
#Strange perc. field -29.7 line 40266 of 7/chr7.new.out
    # And this new one:
#note: 27 records dropped due to repStart > repEnd
#      run with -verbose=2 for details

   nice featureBits canFam1 rmskNew
#948099821 bases of 2359845093 (40.176%) in intersection
    # Good, coverage has increased since original:
#896773874 bases of 2359845093 (38.001%) in intersection


# MAKE BETTER-MASKED SEQUENCE FOR SELF ALIGNMENTS (DONE 1/27/05 angie)
    # I tried self alignments with the sequence masked by the original 
    # RepeatMasker run, but some jobs ran for >1 day and then crashed 
    # due to some heinous chrUn sequence, presumably undermasked.  
    # Since we just ran the new RepeatMasker and coverage improved a 
    # bit, try making a separate newly-masked copy of the sequence and 
    # using that in the blastz self run.  The increase in coverage for 
    # chrUn was pretty significant (yikes, look where it was already):
    # featureBits -chrom=chrUn canFam1 rmsk
#39357392 bases of 69040064 (57.007%) in intersection
    # featureBits -chrom=chrUn canFam1 rmskNew
#47147529 bases of 69040064 (68.290%) in intersection
    # So the problematic parts of chrUn may still run forever and crash,
    # but if they do, at least we did our best.  :)
    ssh kkr1u00
    mkdir /iscratch/i/canFam1/newRMNib
    cd /iscratch/i/canFam1/newRMNib
    set trfDir = /cluster/data/canFam1/bed/simpleRepeat/trfMaskChrom
    foreach f (/cluster/data/canFam1/*/chr*.fa)
      set chr = $f:t:r
      set newRmOut = $f:r.new.out
      echo $chr
      maskOutFa -soft $f $newRmOut $chr.fa
      maskOutFa -softAdd $chr.fa $trfDir/$chr.bed $chr.fa
      faToNib -softMask $chr.fa $chr.nib
    end
    rm chr*.fa
    iSync


# BLASTZ SELF (DONE 2/2/05 angie)
    ssh kk
    mkdir -p /cluster/data/canFam1/bed/blastz.canFam1.2005-01-27
    cd /cluster/data/canFam1/bed/blastz.canFam1.2005-01-27
    ln -s blastz.canFam1.2005-01-27 /cluster/data/canFam1/bed/blastz.canFam1
    cat << '_EOF_' > DEF
# dog vs. dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0

# TARGET
# Dog
SEQ1_DIR=/iscratch/i/canFam1/newRMNib
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY
# Dog
SEQ2_DIR=/iscratch/i/canFam1/newRMNib
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=10000

BASE=/cluster/data/canFam1/bed/blastz.canFam1.2005-01-27

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line makes emacs coloring happy

    cp /cluster/data/canFam1/chrom.sizes S1.len
    cp /cluster/data/canFam1/chrom.sizes S2.len
    mkdir run
    cd run
    partitionSequence.pl 10000000 10000 /iscratch/i/canFam1/newRMNib ../S1.len \
      -xdir xdir.sh -rawDir ../lav \
    > canFam1.lst
    csh -ef xdir.sh
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/scripts/blastz-run-ucsc $(path1) $(path2) ../DEF {check out line ../lav/$(file1)/$(file1)_$(file2).lav}
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    gensub2 canFam1.lst canFam1.lst gsub jobList
    para create jobList
    para try, check, push, check, ...
    # Tolerating some jobs that crashed blastz due to nasty sequence.  
    # All were on chunks in chrUn:10000000-90010000 to themselves or 
    # each other, so just include a note on the details page that results 
    # are incomplete in that region.
#Completed: 75569 of 75625 jobs
#Crashed: 56 jobs
#CPU time in finished jobs:   13811407s  230190.12m  3836.50h  159.85d  0.438 y
#IO & Wait Time:                295434s    4923.90m    82.06h    3.42d  0.009 y
#Average job time:                 187s       3.11m     0.05h    0.00d
#Longest job:                    65494s    1091.57m    18.19h    0.76d
#Submission to last job:        411521s    6858.68m   114.31h    4.76d

    # convert lav files to psl 
    # First, combine all files per partition (too many files per chrom
    # to do in one swoop!).  Then do another round to collect all files 
    # per chrom.  
    ssh kki
    cd /cluster/data/canFam1/bed/blastz.canFam1.2005-01-27
    mkdir pslChrom pslParts run.lavToPsl
    cd run.lavToPsl
    ls -1d ../lav/* | sed -e 's@/$@@' > parts.lst
    # For self alignments, we need lavToAxt's -dropSelf behavior, 
    # so go lav->axt->psl... could add -dropSelf to lavToPsl, but that 
    # might be somewhat invasive and perhaps not really much faster (?) 
    # because the sequence must be dug up in order to rescore...
    cat << '_EOF_' > do.csh
#!/bin/csh -ef
cat $1/*.lav \
| lavToAxt -dropSelf stdin /iscratch/i/canFam1/newRMNib \
                           /iscratch/i/canFam1/newRMNib stdout \
| axtToPsl stdin ../S1.len ../S2.len stdout \
| sort -k 14,14 -k 16n,17n \
| gzip -c > $2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x do.csh
    cat << '_EOF_' > gsub
#LOOP
./do.csh $(path1) {check out exists ../pslParts/$(file1).psl.gz }
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 parts.lst single gsub jobList
    para create jobList
    para try, check, push, check, ...
#Completed: 275 of 275 jobs
#Average job time:                  35s       0.59m     0.01h    0.00d
#Longest job:                      417s       6.95m     0.12h    0.00d
#Submission to last job:          1017s      16.95m     0.28h    0.01d
    awk '{print $1;}' /cluster/data/canFam1/chrom.sizes > chroms.lst
    cat << '_EOF_' > doChrom.csh
#!/bin/csh -ef
zcat ../pslParts/$1.*.psl.gz \
| sort -k 14,14 -k 16n,17n \
| gzip -c > $2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChrom.csh
    cat << '_EOF_' > gsub.chroms
#LOOP
./doChrom.csh $(root1) {check out exists ../pslChrom/$(root1).psl.gz }
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 chroms.lst single gsub.chroms jobList
    para create jobList
    para try, check, push, check, ...
    # This would have been done in 97s if not for monster chrUn!
#Completed: 41 of 41 jobs
#Average job time:                  42s       0.70m     0.01h    0.00d
#Longest job:                      928s      15.47m     0.26h    0.01d
#Submission to last job:           987s      16.45m     0.27h    0.01d


# CHAIN SELF BLASTZ (DONE 2/3/05 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/canFam1/bed/blastz.canFam1.2005-01-27
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S ../../pslChrom/*.psl.gz > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh -ef
setenv PATH $HOME/bin/x86_64:$PATH
axtChain -verbose=0 -psl $1 /iscratch/i/canFam1/newRMNib \
                            /iscratch/i/canFam1/newRMNib stdout \
| chainAntiRepeat /iscratch/i/canFam1/newRMNib /iscratch/i/canFam1/newRMNib \
    stdin $2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
    # chrUn crashed it... too bad for chrUn (not viewed as interesting 
    # for self chains anyway).
#Completed: 40 of 41 jobs
#Crashed: 1 jobs
#Average job time:                 185s       3.09m     0.05h    0.00d
#Longest job:                      490s       8.17m     0.14h    0.01d
#Submission to last job:         74789s    1246.48m    20.77h    0.87d

    # now on the cluster server, sort chains
    ssh kolossus
    cd /cluster/data/canFam1/bed/blastz.canFam1.2005-01-27/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=10000 /tmp/score.$f:t:r
      echo ""
    end

    # trim to minScore=20000 to cut some of the fluff
    mkdir chainFilt
    foreach f (chain/*.chain)
      chainFilter -minScore=20000 $f > chainFilt/$f:t
    end
    gzip -c all.chain > all.chain.unfiltered.gz
    rm -r chain
    mv chainFilt chain
    chainMergeSort -saveId chain/*.chain > all.chain

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/canFam1/bed/blastz.canFam1.2005-01-27/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        echo loading $c
        hgLoadChain canFam1 ${c}_chainSelf $i
    end


# NET SELF BLASTZ (DONE 2/3/05 angie)
    ssh kolossus
    cd /cluster/data/canFam1/bed/blastz.canFam1.2005-01-27/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/canFam1/bed/blastz.canFam1.2005-01-27/axtChain
    netClass -noAr noClass.net canFam1 canFam1 self.net
    rm noClass.net
    # Load the nets into database 
    netFilter -minGap=10 self.net |  hgLoadNet canFam1 netSelf stdin
    # Add entries for chainSelf, netSelf to dog/canFam1 trackDb


# MAKE VSSELF DOWNLOADABLES (DONE 2/3/05 angie)
    ssh kolossus
    cd /cluster/data/canFam1/bed/blastz.canFam1.2005-01-27/axtChain
    gzip all.chain self.net

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/canFam1/vsSelf
    cd /usr/local/apache/htdocs/goldenPath/canFam1/vsSelf
    cp -p /cluster/data/canFam1/bed/blastz.canFam1/axtChain/all.chain.gz \
      self.chain.gz
    cp -p /cluster/data/canFam1/bed/blastz.canFam1/axtChain/self.net.gz .
    md5sum *.gz > md5sum.txt
    # Copy over & edit README.txt w/pointers to chain, net formats.



#########################################################################
# MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram)

# BAC END PAIRS TRACK (DONE, 2005-11-14, hartera)

    # Use latest BAC ends data from NCBI
    ssh kolossus
    cd /cluster/data/ncbi
    mkdir -p bacends/dog/bacends.1
    cd bacends/dog/bacends.1
    # go to the trace archive at:
    # http://www.ncbi.nlm.nih.gov/Traces/trace.cgi
    # Query: species_code='canis familiaris' and center_name='abc'
    # there are 393408 items and can download 40000 items at a time so
    # download the perl scrip to query the trace archive.
    # see http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=doc&m=obtain&s=stips#download
    # for instructions.
    wget --timestamping \
         ftp://ftp.ncbi.nih.gov/pub/TraceDB/misc/query_tracedb
    chmod +x query_tracedb
    ssh kolossus
    cd 
    cat > queryTraceDogBacEnds.csh <<'_EOF_'
#!/bin/csh -fe
foreach n (0 1 2 3 4 5 6 7 8 9)
   echo "Fetching page $n ..."
   (echo -n "retrieve_tgz all 0b"; query_tracedb "query page_size 40000 page_number $n binary species_code='CANIS FAMILIARIS' and center_name='abc'") | query_tracedb > data${n}.tgz 
end
'_EOF_'
    chmod +x queryTraceDogBacEnds.csh
    nohup nice queryTraceDogBacEnds.csh &
    # Took about 5.5 hours.

    wget --timestamping \
         ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/canis_familiaris/9615_clone_ends001.mfa.gz
    wget --timestamping \
         ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/canis_familiaris/9615_clone_info001.txt.gz
    wget --timestamping \
         ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/README
    nice gunzip *.gz
    foreach n (1 2 3 4 5 6 7 8 9)
       nice tar xvf data${n}.tar
       nice rm -r */CANIS_FAMILIARIS/ABC/traces
       nice rm -r */CANIS_FAMILIARIS/ABC/qscore 
    end
    
    foreach f (*/CANIS_FAMILIARIS/ABC/fasta/*.fasta)
       nice cat $f >> canFamBacends.fa
    end
    # took almost 3 hours 
    # change FASTA record to have only the trace name in the header line.
    perl -pi.bak -e 's/>gnl\|ti\|[0-9]+ name:([0-9]+)/>$1/' canFamBacends.fa
    mv canFamBacends.fa.bak bacEndSeqs.fa
    grep '>' canFamBacends.fa | wc -l
    # 393408 - this is the correct number of sequences 
    cd /cluster/data/ncbi/bacends/dog/bacends.1
    # cat together the xml files of BAC clone end information
    cat */*.xml > CHORI82BacEnds.xml
    # clean up
    rm -r 2005*
    mkdir -p /san/sanvol1/scratch/canFam1/bacends
    faSplit sequence /cluster/bluearc/canFam1/bacends/canFamBacends.fa \
            100 /san/sanvol1/scratch/canFam1/bacends/bacends
    ssh kkstore01
    cd /cluster/data/ncbi/bacends/dog/bacends.1
    # get mate-pair information from xml, forward is SP6, reverse is T7
cat > getBacInfo.pl << EOF
#!/usr/bin/perl -w
use strict;

my ($name, $clone, $dir);

while (<STDIN>)
{
chomp;
my $l = $_;
if ($l =~ /<trace_name>([0-9]+)/)
   {
   $name = $1;
   }
elsif ($l =~ /<clone_id>(CH82\-[0-9A-Z]+)/)
   {
   $clone = $1;
   }
elsif ($l =~ /<trace_end>(F|R)/)
   {
   $dir = $1;
   # print out end information
   print "$clone\t$name\t0\tCHORI-82\t0";
   if ($dir eq "F")
      {
      print "\tSP6\n";
      }
   elsif ($dir eq "R")
      {
      print "\tT7\n";
      }
   }
}
EOF
    # << for emacs
    chmod +x getBacInfo.pl
    perl getBacInfo.pl < CHORI82BacEnds.xml > bacEndInfo.txt
    # check all the names are there 
    grep '>' canFamBacends.fa > names
    perl -pi.bak -e 's/>//' names
    sort names | uniq > names.sort
    awk '{print $2}' bacEndInfo.txt  | sort | uniq > bacEndInfo.names.sort
    comm -13 bacEndInfo.names.sort names.sort
# 17714318
# 17716422
# 17755662
# 17798575
# 17858308
# 17859375
# 17928748
# 17967720
# 17967763
# 18124464
# 18124606
# 18261519
    # these 12 are not in the xml file so download manually from the 
    # trace archive. Go to http://www.ncbi.nlm.nih.gov/Traces/trace.cgi 
    # type in search query e.g.
    # species_code='CANIS FAMILIARIS' and center_name='ABC' and (trace_name='17967763' or trace_name='18124464' or trace_name='18124606' or trace_name='18261519')
    # and save as Info-XML
    foreach n (1 2 3) 
      tar xvf trace${n}.tar
    end
    cat */*.xml >> CHORI82BacEnds.xml
    # then create file of BAC end info again
    rm bacEndInfo.txt
    perl getBacInfo.pl < CHORI82BacEnds.xml > bacEndInfo.txt
    # check all the names are there 
    awk '{print $2}' bacEndInfo.txt  | sort | uniq > bacEndInfo.names.sort
    comm -13 bacEndInfo.names.sort names.sort
    # these are both the same so all the names were retrieved from the xml file
    # cleanup 
    rm -r 2005* 
    rm *.bak *.sort names  
    # create mate-pair information
    /cluster/bin/scripts/convertBacEndPairInfo bacEndInfo.txt
    # 196031 pairs and 1346 singles
 
    # Do BLAT alignments of BAC ends to the genome.
    ssh pk
    # copy over masked contigs to the san
    mkdir -p /san/sanvol1/scratch/canFam1/maskedContigs
    rsync -a --progress kkr1u00:/iscratch/i/canFam1/maskedContigs/ \
             /san/sanvol1/scratch/canFam1/maskedContigs
    # copy over 11.ooc file to the san
    rsync -a --progress kkr1u00:/iscratch/i/canFam1/11.ooc \
             /san/sanvol1/scratch/canFam1
    # make output directory
    mkdir -p /san/sanvol1/scratch/canFam1/bacendsPsl
    # make run directory
    mkdir -p /san/sanvol1/scratch/canFam1/bacendsRun
    mkdir -p /cluster/data/canFam1/bed/bacends
    cd /cluster/data/canFam1/bed/bacends
    ln -s /san/sanvol1/scratch/canFam1/bacendsPsl
    ln -s /san/sanvol1/scratch/canFam1/bacendsRun
    cd /cluster/data/canFam1/bed/bacends/bacendsRun
    echo '#LOOP\n/cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/canFam1/11.ooc {check out line+ /san/sanvol1/scratch/canFam1/bacendsPsl/$(root1).$(root2).psl}\n#ENDLOOP' > gsub
    ls -1S /san/sanvol1/scratch/canFam1/maskedContigs/*.fa > contigs.lst
    ls -1S /san/sanvol1/scratch/canFam1/bacends/bacends*.fa > bacends.lst
    gensub2 contigs.lst bacends.lst gsub jobList
    para create jobList
    para try, check, push, check etc.
# para time
# Completed: 49203 of 49203 jobs
# CPU time in finished jobs:     185617s    3093.62m    51.56h    2.15d  0.006 y
# IO & Wait Time:                169691s    2828.18m    47.14h    1.96d  0.005 y
# Average job time:                   7s       0.12m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             212s       3.53m     0.06h    0.00d
# Submission to last job:          1473s      24.55m     0.41h    0.02d
    # lift alignments
    ssh kolossus
    cd /cluster/data/canFam1/bed/bacends
    pslSort dirs raw.psl tmp bacendsPsl
    # started 08:15, PID: 16561
    pslCheck raw.psl >& check.log 
    wc -l check.log
    # 2467 check.log so 2467 invalid alignments with overlapping blocks
    wc -l raw.psl
    # 27147459 raw.psl - 0.009% of alignments have overlapping blocks.
    pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
                raw.psl  bacEnds.psl /dev/null
    # Processed 27147454 alignments
    wc -l bacEnds.psl
    # 771166 bacEnds.psl
    # with pslCheck, 1010 of these have overlapping blocks, 0.008%
    liftUp bacEnds.lifted.psl /cluster/data/canFam1/jkStuff/liftAll.lft \
           warn bacEnds.psl
    awk '{print $10}' bacEnds.lifted.psl | sort | uniq | wc -l
    # 317098 without headers - 80.6% aligned, 393408 total
    # for unfiltered alignments, raw.psl, 85.7% aligned.
    # Make BAC end pairs track: 
    ssh kkstore03
    # create alignment pairs
    mkdir -p /cluster/data/canFam1/bed/bacends/pairs
    cd /cluster/data/canFam1/bed/bacends/pairs
    set ncbiDir = /cluster/data/ncbi/bacends/dog/bacends.1
    /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan -mismatch -verbose ../bacEnds.lifted.psl $ncbiDir/bacEndPairs.txt all_bacends bacEnds
    wc -l *
#     29 bacEnds.long
#    432 bacEnds.mismatch
#  46668 bacEnds.orphan
# 131858 bacEnds.pairs
#    499 bacEnds.short
#    805 bacEnds.slop

    # create header required by "rdb" tools
    echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' \
         > header
    echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header
    # edit header to make sure \t is/become tab character
    cat header bacEnds.pairs | row score ge 300 | sorttbl chr start \
        | headchg -del > bacEndPairs.bed
    # Loaded 131858 elements of size 11
    cat header  bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
        bacEnds.orphan | row score ge 300 | sorttbl chr start \
        | headchg -del > bacEndPairsBad.bed
    # Loaded 48406 elements of size 11
    extractPslLoad -noBin ../bacEnds.lifted.psl bacEndPairs.bed   \
        bacEndPairsBad.bed | sorttbl tname tstart | headchg -del \
        > bacEnds.load.psl
    # load into database
    ssh hgwdev
    cd /cluster/data/canFam1/bed/bacends/pairs
    hgLoadBed canFam1 bacEndPairs bacEndPairs.bed -notItemRgb \
               -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
    # note - this next track isn't pushed to RR, just used for assembly QA
    hgLoadBed canFam1 bacEndPairsBad bacEndPairsBad.bed -notItemRgb \
              -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
    hgLoadPsl canFam1 -table=all_bacends bacEnds.load.psl
    # load of all_bacends did not go as planned: 746545 record(s), 0 row(s) 
    # skipped, 433 warning(s) loading psl.tab 
    # all alignments were loaded into the table.
    # load BAC end sequences into seq table so alignments may be viewed
    # symlink to FASTA sequence file in ncbi directory
    mkdir -p /gbdb/canFam1/bacends
    ln -s /cluster/data/ncbi/bacends/dog/bacends.1/canFamBacends.fa \
          /gbdb/canFam1/bacends/canFamBacends.fa
    hgLoadSeq canFam1 /gbdb/canFam1/bacends/canFamBacends.fa
    # 393408 sequences
# featureBits canFam1 all_bacends
# 211644790 bases of 2359845093 (8.969%) in intersection
# featureBits canFam1 bacEndPairs
# 2334084046 bases of 2359845093 (98.908%) in intersection
# featureBits canFam1 bacEndPairsBad
# 548130287 bases of 2359845093 (23.227%) in intersection
# add trackDb entry and html
# Changes were made to hgc.c so that the BAC clone and end name links 
# go to the relevant entries in the NCBI Trace Archive.


# BROAD(/ENSEMBL/OXFORD) GENE ANNOTATIONS (DONE 2/15/06 angie)
    # These do not look good enough to push -- I just loaded them because 
    # David and Mark were interested.  They do not include CDS annotation 
    # and don't look very good compared to xenoRefGene.
    ssk kkstore03
    mkdir /cluster/data/canFam1/bed/broadGenes
    cd /cluster/data/canFam1/bed/broadGenes
    wget --timestamping http://www.broad.mit.edu/ftp/pub/papers/dog_genome/suppinfo/DogGenes_CanFam1.zip
    # Translate the three GFF-like formats in this file into vanilla GFF 
    # with transcript IDs for grouping:
    perl -wpe 's/ENSCAFE\d+\s+ENSCAFG\d+\s+(ENSCAFT\d+)\s+\w+\s*$/$1\n/; \
               s/\d+\.\d+\.\d+\s+\d+\s+// if /broad/; \
               s/\d+\s+\d+\s+(\d+)\s+\w+\s*$/$1\n/ if /oxford/; \
               s/^/chr/;' \
      dog_final.cdna.gff > cleaned.gff
    grep ensembl cleaned.gff > ensemblExons.gff
    grep broad cleaned.gff > broadExons.gff
    grep oxford cleaned.gff > oxfordExons.gff

    ssh hgwdev
    cd /cluster/data/canFam1/bed/broadGenes
    ldHgGene canFam1 ensemblExons ensemblExons.gff
    ldHgGene canFam1 broadExons broadExons.gff
    ldHgGene canFam1 oxfordExons oxfordExons.gff


# ENSEMBL GENES (DONE 2/22/06 angie)
    mkdir /cluster/data/canFam1/bed/ensembl
    cd /cluster/data/canFam1/bed/ensembl
    # Get the ensembl gene data from 
    # http://www.ensembl.org/Canis_familiaris/martview
    # Follow this sequence through the pages:
    # Page 1) Make sure that the Canis_familiaris choice is selected. Hit next.
    # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
    # Page 3) Choose the "Structures" box. 
    # Page 4) Choose GTF as the ouput.  choose gzip compression.  hit export.
    # Save as ensembl.gff.gz
    # Add "chr" to front of each line in the gene data gtf file to make 
    # it compatible with our software.
    gunzip -c ensembl.gff.gz \
    | perl -wpe 's/^([0-9]+|X|Y|M|Un(_random)?)/chr$1/ \
                 || die "Line $. doesnt start with rat chrom:\n$_"; \
                 s/chrMT/chrM/;' \
    > ensGene.gtf
    ssh hgwdev
    ldHgGene -gtf -genePredExt canFam1 ensGene \
      /cluster/data/canFam1/bed/ensembl/ensGene.gtf

    # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and 
    # hgKnownToSuper.  Use ensMart to create it as above, except:
    # Page 3) Choose the "Features" box. In "Ensembl Attributes", check 
    # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.  
    # Choose Text, tab-separated as the output format.  Result name ensGtp.
    # Save file as ensGtp.txt.gz
    #-- Normally the versioned accs are available, to match what we get in 
    #-- the GTF.  However, this time the versions are missing... fortunately 
    #-- everything is version .1 this time so we can just fudge it in:
    gunzip -c ensGtp.txt.gz \
    | perl -wpe 's/(ENSCAF[GTP]\d+)/$1.1/g' > ensGtp.tab
    hgLoadSqlTab canFam1 ensGtp ~/kent/src/hg/lib/ensGtp.sql ensGtp.tab

    # Load Ensembl peptides:
    # Get them from ensembl as above in the gene section except for
    # Page 3) Choose the "Sequences" box. 
    # Page 4) Transcripts/Proteins.  Peptide.  Format = FASTA.
    # Save file as ensemblPep.fa.gz
    gunzip -c ensemblPep.fa.gz \
    | perl -wpe 's/^>.*\|(ENSCAFT\d+\.\d+).*/>$1/' \
    > ensPep.fa
    hgPepPred canFam1 generic ensPep ensPep.fa


##########################################################################
# NSCAN track - (2006-03-27 markd)
    cd /cluster/data/canFam1/bed/nscan/

    # obtainedf NSCAN predictions from michael brent's group
    # at WUSTL

    wget -nv http://genes.cse.wustl.edu/predictions/dog/canFam1/CanFam1_hg17_nscan_10_18_2005.tar.gz
    chmod a-w CanFam1_hg17_nscan_10_18_2005.tar.gz 
    tar -zxf  CanFam1_hg17_nscan_10_18_2005.tar.gz 

    # load tracks.  Note that these have *utr features, rather than
    # exon features.  currently ldHgGene creates separate genePred exons
    # for these.
    ldHgGene -bin -gtf -genePredExt canFam1 nscanGene  CanFam1_hg17_nscan_10_18_2005/chr_gtf/*.gtf
    # add .a suffix to match transcript id
    hgPepPred -suffix=.a canFam1 generic nscanPep CanFam1_hg17_nscan_10_18_2005/chr_ptx/*.ptx
    rm -rf CanFam1_hg17_nscan_10_18_2005 *.tab

    # update trackDb; need a canFam1-specific page to describe informants
    dog/canFam1/nscanGene.html    (from  CanFam1_hg17_nscan_10_18_2005/description.html )
    dog/canFam1/trackDb.ra


###########################################################################
# LIFTOVER CHAINS CANFAM1 -> CANFAM2 (DONE 7/10/06 angie)
    # Started 7/5 -- This creates the directory and scripts:
    ~/kent/src/utils/doSameSpeciesLiftOver.pl canFam1 canFam2 -debug
    # Run it for real and log it:
    cd /cluster/data/canFam1/bed/blat.canFam2.2006-07-05
    ~/kent/src/utils/doSameSpeciesLiftOver.pl canFam1 canFam2 \
       >& do.log & tail -f do.log
    # Had to use -continue a few times because I'm debugging the script,
    # and -buildDir /cluster/data/canFam1/bed/blat.canFam2.2006-07-05 when
    # the date changed, but it finished!


###########################################################################
# dbSNP BUILD 125 (Heather, August 2006)
# We didn't have ctgPos (I looked for an hour+ on the NCBI site, couldn't find it).
# dbSNP build 125 uses chrom coords (phys_pos_from and phys_pos) in ContigLoc
# but not for chrUn.  chrUn has contig coords only, and without ctgPos
# for supercontigs, I can't translate the chrUn contig coords into chrom coords.
# Also note: canFam1 has no chrY, and there were no SNPs for chrM.
# Not having ctgPos also meant I couldn't process the new locTypes 
# (4,5,6: rangeInsertion, rangeSubstitution, rangeDeletion)
# because those are also only given in contig coords
# These were a small number.

# Set up directory structure
ssh kkstore02
cd /cluster/data/dbSNP/125/organisms/dog_9615
mkdir data
mkdir schema
mkdir rs_fasta

# Get data from NCBI (anonymous FTP)
cd /cluster/data/dbSNP/125/organisms/dog_9615/data
ftp ftp.ncbi.nih.gov
cd snp/organisms/dog_9615/database/organism_data
# ContigLoc table has coords, orientation, loc_type, and refNCBI allele
get b125_SNPContigLoc_1_1.bcp.gz
# ContigLocusId has function
get b125_SNPContigLocusId_1_1.bcp.gz
get b125_ContigInfo_1_1.bcp.gz
# MapInfo has alignment weights
get b125_SNPMapInfo_1_1.bcp.gz
# SNP has univar_id, validation status and heterozygosity
get SNP.bcp.gz

# Get schema from NCBI
cd /cluster/data/dbSNP/125/organisms/dog_9615/schema
ftp ftp.ncbi.nih.gov
cd snp/organisms/dog_9615/database/organism_schema
get dog_9615_table.sql.gz

# Get fasta files from NCBI
# using headers of fasta files for molType
cd /cluster/data/dbSNP/125/organisms/dog_9615/rs_fasta
ftp ftp.ncbi.nih.gov
cd snp/organisms/dog_9615/rs_fasta
prompt
mget *.gz

# add rs_fasta to seq/extFile
# 2 edits first: strip header to just rsId, and remove duplicates
# work on /cluster/store12 (kkstore05) which has more disk space
cp rs_ch*.fas.gz /cluster/store12/snp/125/dog/rs_fasta
ssh kkstore05
cd /cluster/store12/snp/125/dog/rs_fasta
# concat into rsAll.fas
cat << '_EOF_' > concat.csh
#!/bin/csh -ef
rm -f rsAll.fas
foreach file (rs_ch*.fas)
    echo $file
    zcat $file >> rsAll.fas
end
'_EOF_' 
# snpCleanSeq strips the header and skips duplicates
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCleanSeq rsAll.fas snp.fa
rm rsAll.fas
# load on hgwdev
ssh hgwdev
mkdir /gbdb/canFam1/snp
ln -s /cluster/store12/snp/125/dog/rs_fasta/snp.fa /gbdb/canFam1/snp/snp.fa
cd /clsuter/store12/snp/125/dog/rs_fasta
hgLoadSeq canFam1 /gbdb/canFam1/snp/snp.fa

# look up id in extFile
# move into separate table
hgsql canFam1 < snpSeq.sql
hgsql -e 'insert into snpSeq select acc, file_offset from seq where extFile = 455951' canFam1
hgsql -e 'delete from seq where extFile = 455951' canFam1
hgsql -e 'alter table snpSeq add index acc (acc)' canFam1

# clean up after hgLoadSeq
rm seq.tab

# Simplify names of data files
cd /cluster/data/dbSNP/125/organisms/dog_9615/data
mv b125_ContigInfo_1_1.bcp.gz ContigInfo.gz
mv b125_SNPContigLoc_1_1.bcp.gz ContigLoc.gz
mv b125_SNPContigLocusId_1_1.bcp.gz ContigLocusId.gz
mv b125_SNPMapInfo_1_1.bcp.gz MapInfo.gz
mv SNP.bcp.gz SNP.gz
ls -1 *.gz > filelist

# edit table descriptions
cd /cluster/data/dbSNP/125/organisms/dog_9615/schema
# get CREATE statements from dog_9615_table.sql for our 5 tables
# store in table.tmp
# convert and rename tables
sed -f 'mssqlToMysql.sed' table.tmp > table2.tmp
rm table.tmp
sed -f 'tableRename.sed' table2.tmp > table.sql
rm table2.tmp

# get header lines from rs_fasta
cd /cluster/data/dbSNP/125/organisms/dog_9615/rs_fasta
/bin/csh gnl.csh

# load on kkr5u00
ssh kkr5u00
hgsql -e mysql 'create database canFam1snp125' 
cd /cluster/data/dbSNP/125/organisms/dog_9615/schema
hgsql canFam1snp125 < table.sql
cd ../data
/bin/csh load.csh

# note rowcount
# ContigLoc     10806955 
# SNP            3310003 
# MapInfo        3310003 
# ContigLocusId  1219541 

# Get UniVariation from 126 (recently downloaded for mm8snp126)
cd /cluster/data/dbSNP/126/shared
hgsql canFam1snp125 < UniVariation.sql
zcat UniVariation.bcp.gz | hgsql -e 'load data local infile "/dev/stdin" into table UniVariation' canFam1snp125

# create working /scratch dir
cd /scratch/snp/125
mkdir dog
cd dog

# no canFam1.ctgPos table to compare against

# get gnl files
cp /cluster/data/dbSNP/125/organisms/dog_9615/rs_fasta/*.gnl .

# examine ContigInfo for group_term and edit pipeline.csh
# use "ref_assembly" 

# filter ContigLoc into ContigLocFilter
# this lifts from contig coords to chrom coords
# phys_pos_from is used to check coords for non-random chroms
# errors reported to stdout
# this gets rid of alternate assemblies (using ContigInfo)
# this also gets rid of poor quality alignments (weight == 10 || weight == 0 in MapInfo)
# assumes all contigs are positively oriented; will abort if not true

mysql> desc ContigLocFilter;
#  +---------------+-------------+------+-----+---------+-------+
#  | Field         | Type        | Null | Key | Default | Extra |
#  +---------------+-------------+------+-----+---------+-------+
#  | snp_id        | int(11)     | NO   |     |         |       |
#  | ctg_id        | int(11)     | NO   |     |         |       |
#  | chromName     | varchar(32) | NO   |     |         |       |
#  | loc_type      | tinyint(4)  | NO   |     |         |       |
#  | start         | int(11)     | NO   |     |         |       |
#  | end           | int(11)     | YES  |     | NULL    |       |
#  | orientation   | tinyint(4)  | NO   |     |         |       |
#  | allele        | blob        | YES  |     | NULL    |       |
#  +---------------+-------------+------+-----+---------+-------+
 
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocFilter125 canFam1snp125 ref_assembly Dog1.0
# note rowcount
# ContigLocFilter  3124054
# how many are positive strand? hopefully 90%
# We get 100% here
mysql> select count(*) from ContigLocFilter where orientation = 0;
# 3124054
# note count by loc_type
mysql> select count(*), loc_type from ContigLocFilter group by loc_type;
# +----------+----------+
# | count(*) | loc_type |
# +----------+----------+
# |     2296 |        1 |
# |  2986459 |        2 |
# |   135299 |        3 |
# +----------+----------+

# filter ContigLocusId into ContigLocusIdFilter
# need to filter on contig_acc because ctg_id = 0 for all rows in ContigLocusId
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdFilter125 canFam1snp125 ref_assembly
# note rowcount 
# 1219541 (same as ContigLocusId)

# condense ContigLocusIdFilter into ContigLocusIdCondense (one SNP can have multiple functions)
# assumes SNPs are in numerical order; will errAbort if not true
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdCondense canFam1snp125 
# note rowcount; expect about 50% (ascertainment bias for SNPs within genes)
# actually we get about 90% here
# ContigLocusIdCondense 1168248
# could delete ContigLocusIdFilter table here

# create chrN_snpFasta tables from *.gnl files
# we are just using molType, but also storing class and observed
# need chromInfo for this
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoadFasta canFam1snp125 

# (could start using pipeline.csh here)
# (pipeline.csh takes about 35 minutes to run)

# split ContigLocFilter by chrom 
# create the first chrN_snpTmp
# we will reuse this table name, adding/changing columns as we go
# at this point chrN_snpTmp will have the same description as ContigLocFilter
# this opens a file handle for every chrom, so will not scale to scaffold-based assemblies
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom canFam1snp125 ref_assembly

# check coords using loc_type
# possible errors logged to snpLocType.error:
# Unknown locType 
# Between with allele != '-' 
# Exact with end != start + 1
# Range with end < start 
# possible exceptions logged to snpLocType.exceptions:
# RefAlleleWrongSize
# This run no errors, no exceptions

# morph chrN_snpTmp 

mysql> desc chr1_snpTmp;

#  +---------------+-------------+------+-----+---------+-------+
#  | Field         | Type        | Null | Key | Default | Extra |
#  +---------------+-------------+------+-----+---------+-------+
#  | snp_id        | int(11)     | NO   |     |         |       |
#  | ctg_id        | int(11)     | NO   |     |         |       |
#  | chromStart    | int(11)     | NO   |     |         |       |
#  | chromEnd      | int(11)     | NO   |     |         |       |
#  | loc_type      | tinyint(4)  | NO   |     |         |       |
#  | orientation   | tinyint(4)  | NO   |     |         |       |
#  | allele        | blob        | YES  |     | NULL    |       |
#  +---------------+-------------+------+-----+---------+-------+

/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoctype125 canFam1snp125 ref_assembly

# expand allele as necessary
# report syntax errors to snpExpandAllele.errors
# possible exceptions logged to snpExpandAllele.exceptions:
# RefAlleleWrongSize
# This run no errors, no exceptions
# 43 alleles expanded

/cluster/home/heather/kent/src/hg/snp/snpLoad/snpExpandAllele canFam1snp125 ref_assembly

# the next few steps prepare for working in UCSC space

# sort by position
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSort canFam1snp125 ref_assembly

# rename MT --> M (pipeline.csh takes care of this)
hgsql -e "rename table chrMT_snpTmp to chrM_snpTmp" canFam1snp125
# note that chrM_snpTmp is empty at this point

# get nib files 
ssh hgwdev
cd /cluster/data/canFam1/nib
cp *.nib /cluster/home/heather/transfer/snp/canFam1
ssh kkr5u00
cd /scratch/snp/125/dog
mkdir nib
cp /cluster/home/heather/transfer/snp/canFam1/*.nib nib
rm /cluster/home/heather/transfer/snp/canFam1/*.nib
# edit path in chromInfo, load into canFam1snp125 with editted path

# lookup reference allele in nibs
# keep reverse complement to use in error checking (snpCheckAlleles)
# check here for SNPs larger than 1024
# errAbort if detected
# check for coords that are too large, log to snpRefUCSC.error and skip
# This run no errors
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpRefUCSC canFam1snp125

# morph chrN_snpTmp 

mysql> desc chr1_snpTmp;

#  +--------------------+-------------+------+-----+---------+-------+
#  | Field              | Type        | Null | Key | Default | Extra |
#  +--------------------+-------------+------+-----+---------+-------+
#  | snp_id             | int(11)     | NO   |     |         |       |
#  | ctg_id             | int(11)     | NO   |     |         |       |
#  | chromStart         | int(11)     | NO   |     |         |       |
#  | chromEnd           | int(11)     | NO   |     |         |       |
#  | loc_type           | tinyint(4)  | NO   |     |         |       |
#  | orientation        | tinyint(4)  | NO   |     |         |       |
#  | allele             | blob        | YES  |     | NULL    |       |
#  | refUCSC            | blob        | YES  |     | NULL    |       |
#  | refUCSCReverseComp | blob        | YES  |     | NULL    |       |
#  +--------------------+-------------+------+-----+---------+-------+

# compare allele from dbSNP to refUCSC
# locType between is excluded from this check
# log exceptions to snpCheckAllele.exceptions
# if SNP is positive strand, expect allele == refUCSC
# log RefAlleleMismatch if not
# if SNP is negative strand, if not allele == refUCSC, then check for allele == refUCSCReverseComp
# If allele == refUCSCRevComp, log RefAlleleNotRevComp
# If allele doesn't match either of refUCSC or refUCSCReverseComp, log RefAlleleMismatch
# This run we got:
# 0 RefAlleleMismatch
# 2758   RefAlleleNotRevComp
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckAlleles canFam1snp125

# add class and observed using univar_id from SNP table
# to get class (subsnp_class) and observed (var_str) from UniVariation
# log errors to snpClassAndObserved.errors
# errors detected: 
# class = 0 in UniVariation
# class > 8 in UniVariation
# univar_id = 0 in SNP
# no row in SNP for snp_id in chrN_snpTmp
# This run we got:
# 3 class = 0 in UniVariation
# 0 class > 8 in UniVariation
# 0 univar_id = 0 in SNP 
# 0 no row in SNP for snp_id in chrN_snpTmp 
# dbSNP has class = 'in-del'
# we promote this to 'deletion' for locType 1&2 and to 'insertion' for locType 3

/cluster/home/heather/kent/src/hg/snp/snpLoad/snpClassAndObserved canFam1snp125

# morph chrN_snpTmp
#  +--------------------+---------------+------+-----+---------+-------+
#  | Field              | Type          | Null | Key | Default | Extra |
#  +--------------------+---------------+------+-----+---------+-------+
#  | snp_id             | int(11)       | NO   |     |         |       |
#  | chromStart         | int(11)       | NO   |     |         |       |
#  | chromEnd           | int(11)       | NO   |     |         |       |
#  | loc_type           | tinyint(4)    | NO   |     |         |       |
#  | class              | varchar(255)  | NO   |     |         |       |
#  | orientation        | tinyint(4)    | NO   |     |         |       |
#  | allele             | blob          | YES  |     | NULL    |       |
#  | refUCSC            | blob          | YES  |     | NULL    |       |
#  | refUCSCReverseComp | blob          | YES  |     | NULL    |       |
#  | observed           | blob          | YES  |     | NULL    |       |
#  +--------------------+---------------+------+-----+---------+-------+

# generate exceptions for class and observed

# SingleClassBetweenLocType
# SingleClassRangeLocType
# NamedClassWrongLocType

# ObservedWrongFormat
# ObservedWrongSize 
# ObservedMismatch 

# RangeSubstitutionLocTypeExactMatch

# SingleClassTriAllelic
# SingleClassQuadAllelic

# This will also detect IUPAC symbols in allele

/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckClassAndObserved canFam1snp125

# add function
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFunction canFam1snp125

# add validation status and heterozygosity
# log error if validation status > 31 or missing
# no errors this run
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSNP canFam1snp125

# add molType
# errors detected: missing or duplicate molType
# this run no errors
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpMoltype canFam1snp125

# generate chrN_snp125 and snp125Exceptions tables
cp snpCheckAlleles.exceptions snpCheckAlleles.tab
cp snpCheckClassAndObserved.exceptions snpCheckClassAndObserved.tab
cp snpExpandAllele.exceptions snpExpandAllele.tab
cp snpLocType.exceptions snpLocType.tab
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFinalTable canFam1snp125 125

# concat into snp125.tab
# cat chr*_snp125.tab >> snp125.tab
/bin/sh concat.sh

# check for multiple alignments
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpMultiple canFam1snp125 125
mysql> load data local infile 'snpMultiple.tab' into table snp125Exceptions;

# load on hgwdev
cp snp125.tab /cluster/home/heather/transfer/snp/canFam1
hgsql canFam1snp125 -e 'select * from snp125Exceptions' > /cluster/home/heather/transfer/snp/canFam1/snp125Exceptions.tab
ssh hgwdev
cd transfer/snp/canFam1
mysql> load data local infile 'snp125.tab' into table snp125; 
mysql> load data local infile 'snp125Exceptions.tab' into table snp125Exceptions; 

# create indexes
mysql> alter table snp125 add index name (name);
mysql> alter table snp125 add index chrom (chrom, bin);
mysql> alter table snp125Exceptions add index name(name);

# create snp125ExceptionDesc table
cd /cluster/data/dbSNP
hgsql canFam1 < snp125ExceptionDesc.sql
# add counts to exception.human.125, can start with exception.template
hgsql -e 'select count(*), exception from snp125Exceptions group by exception' canfam1
mysql> load data local infile 'exception.canFam1.snp125' into table snp125ExceptionDesc;

mysql> select count(*), exception from snp125Exceptions group by exception;
+----------+---------------------------+
| count(*) | exception                 |
+----------+---------------------------+
|    97271 | MultipleAlignments        |
|     3687 | ObservedMismatch          |
|     1496 | ObservedWrongSize         |
|     2758 | RefAlleleNotRevComp       |
|     4102 | SingleClassBetweenLocType |
|       26 | SingleClassQuadAllelic    |
|      799 | SingleClassRangeLocType   |
|      359 | SingleClassTriAllelic     |
+----------+---------------------------+


#####################################################################
# SEGMENTAL DUPLICATIONS (DONE 10/16/06 angie)
    # File emailed from Ginger Cheng <ginger2@u.washington.edu>
    mkdir /cluster/data/canFam1/bed/genomicSuperDups
    cd /cluster/data/canFam1/bed/genomicSuperDups
    awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' canFam1_WGAC.tab \
    | hgLoadBed canFam1 genomicSuperDups stdin \
      -tab -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql


#####################################################################
# MICROSATELLITES (DONE 02/22/08 kuhn)

cd /cluster/data/canFam1/bed
mkdir microsat
cd microsat

awk '
($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 
  {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' 
     ../simpleRepeat/simpleRepeat.bed > microsat.bed
hgLoadBed canFam1 microsat microsat.bed
# {actually needed to catenate the whole  awk command to get it to work)


