# for emacs: -*- mode: sh; -*-


# This file describes how we made the browser database on 
# NCBI build 32 (March, 2003 freeze)

# [For importing GTF tracks, use /projects/compbio/bin/validate_gtf.pl]

# HOW TO BUILD AN ASSEMBLY FROM NCBI FILES
# ---------------------------------------

# NOTE: It is best to run most of this stuff on eieio since it
# is not averse to handling files > 2Gb

# 0) Make gs.15 directory, and the gs.15/build32, gs.15/agp and gs.15/ffa directories.
    mkdir /cluster/store5/gs.15
    mkdir /cluster/store5/gs.15/build32
    mkdir /cluster/store5/gs.15/agp
    mkdir /cluster/store5/gs.15/ffa

#    Make a symbolic link from /cluster/store1 to this location
	
    cd /cluster/store1
    ln -s /cluster/store5/gs.15 ./gs.15

#    Make a symbolic link from your home directory to the build dir:

    ln -s /cluster/store5/gs.15/build32 ~/oo

# 1) Download seq_contig.md, ncbi_build32.agp, contig_overlaps.agp 
# and contig fa file into gs.15/build32 directory. 

# Download all finished agp's and fa's into gs.15/agp

# Download sequence.inf and ncbi_build32.fa files into gs.15/ffa, and unzip
# ncbi_build32.fa.

# *** For build32, files split into reference.agp/reference.fa (main O&O), DR51.agp/DR51.fa,
#     and DR52.agp/DR52.fa. (alternate versions of MHC region).  These were concatenated 
#     to get the ncbi_build32.agp and ncbi_build32.fa

# 2) Sanity check things with 
    /cluster/bin/i386/checkYbr build32/ncbi_build32.agp ffa/ncbi_build32.fa \
      build32/seq_contig.md
#      report any errors back to Richa and Greg at NCBI.

# 3) Convert fa files into UCSC style fa files and place in "contigs" directory
#    inside the gs.15/build32 directory 

    cd build32
    mkdir contigs
    /cluster/bin/i386/faNcbiToUcsc -split -ntLast ../ffa/ncbi_build32.fa \
      contigs

# 3.1) Make a fake chrM contig
    cd ~/oo
    mkdir M
# copy in chrM.fa, chrM.agp and chrM.gl from previous version (into M/).
    mkdir M/NT_999999
# The contig directory was created as M/NT_999999, so the destination must
# be given relative to ~/oo through M/.  NOTE(review): this assumes chrM.fa
# was copied into M/ like the other chromosome directories -- confirm.
    cp M/chrM.fa M/NT_999999/NT_999999.fa

# copied chrM.fa, chrM.agp, chrM.gl, chrM.trf.bed, lift directory, NT_999999/NT_999999.fa - not sure which ones we need

# 4) Create lift files (this will create chromosome directory structure) and inserts file

    /cluster/bin/scripts/createNcbiLifts seq_contig.md .

# 5) Create contig agp files (will create contig directory structure)
	
    /cluster/bin/scripts/createNcbiCtgAgp seq_contig.md ncbi_build32.agp .

# 5.1) Create contig gl files

    ~kent/bin/i386/agpToGl contig_overlaps.agp . -md=seq_contig.md

# 6) Create chromosome agp files

    /cluster/bin/scripts/createNcbiChrAgp .

# 6.1) Copy over jkStuff from previous build
    mkdir jkStuff
    cp /cluster/store1/gs.14/build31/jkStuff/*.sh jkStuff
    cp /cluster/store1/gs.14/build31/jkStuff/*.csh jkStuff
    cp /cluster/store1/gs.14/build31/jkStuff/*.gsub jkStuff        

# 6.2) Patch in size of chromosome Y into Y/lift/ordered.lft 
#      by grabbing it from the last line of Y/chrY.agp (not needed for build32)

# 6.3) Create chromosome gl files
  
    jkStuff/liftGl.sh contig.gl

# 7) Distribute contig .fa to appropriate directory (assumes all files
#    are in "contigs" directory).

    /cluster/bin/scripts/distNcbiCtgFa contigs .
    rm -r contigs

# 8) Reverse complement NT contig fa files that are flipped in the assembly
#    (uses faRc program)
# Not done for build32 because all contigs on + strand.  It should be this
# way for the rest of the assemblies

    /cluster/bin/scripts/revCompNcbiCtgFa seq_contig.md .

# (NOTE: STS placements may be done at this point before repeat masking and 
# using the .fa's on NFS for QC analysis - all other placements should be 
# done after repeat masking and distributing to cluster nodes)


# GET FRESH MRNA/EST AND REFSEQ SEQUENCE FROM GENBANK (DONE 03/15/03)
    # Run this just before the sequence gets here!  It's OK to work on 
    # this in parallel with Terry's steps above, or in parallel with 
    # RepeatMasker below, but DO NOT let this hold up RepeatMasker.  

    # This will create a genbank.134 directory containing compressed
    # GenBank flat files and a mrna.134 containing unpacked sequence
    # info and auxiliary info in a relatively easy to parse (.ra)
    # format.

    # Point your browser to ftp://ftp.ncbi.nih.gov/genbank and look at 
    # the README.genbank.  Figure out the current release number.  (134)
    lynx ftp://ftp.ncbi.nih.gov/genbank/README.genbank
    # Consider deleting one of the older genbank releases.  It's
    # good to at least keep one previous release though.

    # Where there is space make a new genbank directory.  Create a
    # symbolic link to it:
    ssh eieio
    mkdir /cluster/store5/genbank.134
    ln -s /cluster/store5/genbank.134 ~/genbank
    cd ~/genbank
    # ncftp is handy -- it does anonymous login; "prompt" command not needed.
    ncftp ftp.ncbi.nih.gov
      cd genbank
      mget gbpri* gbrod* gbv* gbsts* gbest* gbmam* gbinv* gbbct* gbhtc* gbpat* gbphg* gbpln*
      quit
    # This will take at least 2 hours.

    # Make the refSeq subdir and download files:
    ssh eieio
    mkdir -p /cluster/store5/mrna.134/refSeq
    cd /cluster/store5/mrna.134/refSeq
    ncftp ftp.ncbi.nih.gov
      cd refseq/cumulative
      mget *.Z
      quit
    # Get extra info & human proteins from NCBI:
    wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/loc2ref
    wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/mim2loc
    wget ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/hs.faa.gz
    gunzip hs.faa.gz
    # Unpack this into species-specific fa files and get extra info with:
    cd /cluster/store5/mrna.134/refSeq
    cp /cluster/store2/mrna.133/*.fil ..
    gunzip -c rscu.gbff.Z \
    | gbToFaRa -byOrganism=org ../anyRna.fil refSeq.{fa,ra,ta} stdin

    # Now unpack and organize the larger genbank mrna/est sequences...
    ssh eieio
    cd /cluster/store5/mrna.134
    # Make the RNAs for all organisms
    gunzip -c \
      /cluster/store5/genbank.134/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \
    | gbToFaRa -byOrganism=org anyRna.fil mrna.{fa,ra,ta} stdin
    # Make the ESTs for all organisms
    gunzip -c /cluster/store5/genbank.134/gbest*.gz \
    | gbToFaRa anyRna.fil est.{fa,ra,ta} stdin -byOrganism=org
    # Make the nonhuman RNAs
    gunzip -c \
      /cluster/store5/genbank.134/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \
    | gbToFaRa humanXenoRna.fil humanXenoRna.{fa,ra,ta} stdin
    # Make the nonMouse RNAs
    gunzip -c \
      /cluster/store5/genbank.134/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \
    | gbToFaRa mouseXenoRna.fil mouseXenoRna.{fa,ra,ta} stdin
    # Make the nonRat RNAs
    gunzip -c \
      /cluster/store5/genbank.134/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \
    | gbToFaRa ratXenoRna.fil ratXenoRna.{fa,ra,ta} stdin
    # Make the nonhuman ESTs
    gunzip -c /cluster/store5/genbank.134/gbest*.gz \
    | gbToFaRa humanXenoRna.fil humanXenoEst.{fa,ra,ta} stdin
    # Split the really large ones into smaller pieces for more efficient 
    # cluster runs.  
    mkdir humanXenoRnaSplit humanXenoEstSplit
    faSplit about humanXenoRna.fa 10000000 humanXenoRnaSplit/xenoRna
    faSplit about humanXenoEst.fa 70000000 humanXenoEstSplit/xenoEst
    cd org/Homo_sapiens
    mkdir estSplit
    faSplit about est.fa 250000000 estSplit/est
    # Distribute the files to /iscratch/i/ so they're all ready to be aligned.
    ssh kkr1u00
    mkdir -p /iscratch/i/mrna.134/Homo_sapiens
    cp -p /cluster/store5/mrna.134/refSeq/org/Homo_sapiens/refSeq.fa \
      /iscratch/i/mrna.134/Homo_sapiens/
    cp -p /cluster/store5/mrna.134/org/Homo_sapiens/mrna.fa \
      /iscratch/i/mrna.134/Homo_sapiens/
    cp -p /cluster/store5/mrna.134/org/Homo_sapiens/estSplit/*.fa \
      /iscratch/i/mrna.134/Homo_sapiens/
    cp -p /cluster/store5/mrna.134/humanXenoRnaSplit/*.fa \
      /iscratch/i/mrna.134/Homo_sapiens/
    cp -p /cluster/store5/mrna.134/humanXenoEstSplit/*.fa \
      /iscratch/i/mrna.134/Homo_sapiens/
    ~kent/bin/iSync


# REPEAT MASKING (DONE 03/17/03)
    # Split contigs, run RepeatMasker, lift results
    # Notes: 
    # * If there is a new version of RepeatMasker, build it and ask the admins 
    #   to binrsync it (kkstore:/scratch/hg/RepeatMasker/*).
    # * Contigs (*/NT_*/NT_*.fa) are split into 500kb chunks to make 
    #   RepeatMasker runs manageable on the cluster ==> results need lifting.
    # * For the NCBI assembly we repeat mask on the sensitive mode setting
    #   (RepeatMasker -s)

    #- Split contigs into 500kb chunks:
    ssh eieio
    cd ~/hg14
    foreach d ( ?{,?}/NT_* )
      cd $d
      set contig = $d:t
      faSplit size $contig.fa 500000 ${contig}_ -lift=$contig.lft \
        -maxN=500000
      cd ../..
    end

    #- Make the run directory and job list:
    cd ~/hg14
    mkdir RMRun
    rm -f RMRun/RMJobs
    touch RMRun/RMJobs
    foreach d ( ?{,?}/NT_* )
      foreach f ( /cluster/store5/gs.15/build32/$d/NT_*_*.fa )
        set f = $f:t
        echo /cluster/bin/scripts/RMLocalSens \
             /cluster/store5/gs.15/build32/$d $f \
            '{'check out line+ /cluster/store5/gs.15/build32/$d/$f.out'}' \
          >> RMRun/RMJobs
      end
    end

    #- Do the run
    ssh kk
    cd ~/hg14/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check,...
    #- Now while that's running, run TRF (simpleRepeat), and RefSeq 
    #- alignments, in parallel.  Also, create the database and the 
    #- tracks that don't rely on cluster runs or on masked sequence.  

    #- Lift up the split-contig .out's to contig-level .out's
    ssh eieio
    cd ~/hg14
    foreach d ( ?{,?}/NT_* )
      cd $d
      set contig = $d:t
      liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null
      cd ../..
    end
    #- Lift up RepeatMasker .out files to chromosome coordinates via liftOut2.sh:
    tcsh jkStuff/liftOut2.sh
    #- By this point, the database should have been created (below):
    ssh hgwdev
    cd ~/hg14
    hgLoadOut hg14 ?/*.fa.out ??/*.fa.out


# VERIFY REPEATMASKER RESULTS (DONE 03/18/03)

    # Run featureBits on hg14 and on a comparable genome build, and compare:
    ssh hgwdev
    featureBits hg14 rmsk
    # --> 1384772888 bases of 3060248386 (45.250%) in intersection

    featureBits hg13 rmsk
    # --> 1383216615 bases of 3070074689 (45.055%) in intersection

    # Validate the RepeatMasking by randomly selecting a few NT_*.fa files, 
    # manually repeat masking them and matching the .out files with the 
    # related part in the chromosome-level .out files.  For example:
    ssh kkr1u00
    # Pick arbitrary values of $chr and $nt and run these commands: 
    set chr = 1
    set nt  = NT_004321
    # Use -p so the parent /tmp/RMTest is created too if it does not exist yet.
    mkdir -p /tmp/RMTest/$nt
    cd /tmp/RMTest/$nt
    # Work on a private copy of the contig so the build tree is untouched.
    cp ~/hg14/$chr/$nt/$nt.fa .
    # -s = sensitive mode, the same setting used for the NCBI assembly run.
    /scratch/hg/RepeatMasker/RepeatMasker -s $nt.fa
    # Compare $nt.fa.out against the original ~/hg14/$chr/$nt/$nt.fa.out 
    # and against the appropriate part of $chr/chr$chr.fa.out (use the coords 
    # for $nt given in seq_contig.md).  


# MAKE LIFTALL.LFT, NCBI.LFT (DONE 03/16/03)
    cd ~/hg14
    cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft
    # Create jkStuff/ncbi.lft for lifting stuff built with the NCBI assembly.
    # Note: this ncbi.lft will not lift floating contigs to chr_random coords,
    # but it will show the strand orientation of the floating contigs 
    # (grep for '|').
    mdToNcbiLift seq_contig.md jkStuff/ncbi.lft 
    # If a lift file has been edited (e.g. as in 6.2.5 above), edit ncbi.lft 
    # to match.


# SIMPLE REPEAT [TRF] TRACK (DONE 03/16/03)
    # Distribute contigs to /iscratch/i
    ssh kkr1u00
    rm -rf /iscratch/i/gs.15/build32/contigs
    mkdir -p /iscratch/i/gs.15/build32/contigs
    cd ~/hg14
    foreach c (?{,?})
      echo copying contigs of chr$c
      cp -p $c/NT_*/NT_??????.fa /iscratch/i/gs.15/build32/contigs
    end
    # Make sure the total size looks like what you'd expect:
    du -sh /iscratch/i/gs.15/build32/contigs
    ~kent/bin/iSync

    # Create cluster parasol job like so:
    ssh kk
    mkdir -p ~/hg14/bed/simpleRepeat
    cd ~/hg14/bed/simpleRepeat
    cp ~/hg13/bed/simpleRepeat/gsub .
    mkdir trf
    ls -1S /iscratch/i/gs.15/build32/contigs/*.fa > genome.lst
    echo "" > dummy.lst
    gensub2 genome.lst dummy.lst gsub spec
    para create spec
    para try
    para check
    para push
    para check
    # When cluster run is done
    liftUp simpleRepeat.bed ~/hg14/jkStuff/liftAll.lft warn trf/*.bed

    # Load into the database:
    ssh hgwdev
    cd ~/hg14/bed/simpleRepeat
    hgLoadBed hg14 simpleRepeat simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql


# REFSEQ ALIGNMENTS AND REFGENE TRACK PREP (DONE 03/16/03)
    # Make sure contigs have been distributed to /iscratch/i/ (should have 
    # been done for simpleRepeat/TRF above)
    # Make sure refSeq.fa is under /iscratch/i too (GENBANK above)
    ssh kk
    mkdir ~/hg14/bed/refSeq
    cd ~/hg14/bed/refSeq
    mkdir psl
    ls -1S /iscratch/i/gs.15/build32/contigs/*.fa > genome.lst
    ls -1 /iscratch/i/mrna.134/Homo_sapiens/refSeq.fa > mrna.lst
    cp ~/hg13/bed/refSeq/gsub .
    gensub2 genome.lst mrna.lst gsub spec
    para create spec
    para try, para check, para push, para check....
    para time > time
    # When cluster is done, process refSeq alignments into near best in genome.
    ssh eieio
    cd ~/hg14/bed/refSeq
    pslSort dirs raw.psl /tmp psl
    pslReps -minCover=0.2 -sizeMatters -minAli=0.98 -nearTop=0.002 raw.psl \
      contig.psl /dev/null
    liftUp -nohead all_refSeq.psl ../../jkStuff/liftAll.lft carry contig.psl
    pslSortAcc nohead chrom /tmp all_refSeq.psl
    pslCat -dir chrom > refSeqAli.psl
    # After the database has been created, go to "LOAD REFGENE" below...


# PROCESS SIMPLE REPEATS INTO MASK (DONE 03/16/03)
    # After the simpleRepeats track has been built, make a filtered version 
    # of the trf output: keep trf's with period <= 12:
    ssh eieio
    cd ~/hg14/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/NT_*.bed)
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    # Lift up filtered trf output to chrom coords as well:
    cd ~/hg14
    mkdir -p bed/simpleRepeat/trfMaskChrom
    foreach c (?{,?})
      if (-e $c/lift/ordered.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
          $c/lift/ordered.lst > $c/lift/oTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
      endif
      if (-e $c/lift/random.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
           $c/lift/random.lst > $c/lift/rTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
      endif
    end


# MASK SEQUENCE WITH BOTH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 03/17/03)
    # This used to be done right after RepeatMasking.  Now, we mask with 
    # TRF as well, so do this after the "PROCESS SIMPLE REPEATS" step above.
    ssh eieio
    cd ~/hg14
    #- Soft-mask (lower-case) the contig and chr .fa's
    ./jkStuff/makeFaMasked.sh
    #- Make hard-masked .fa.masked files as well:
    ./jkStuff/makeHardMasked.sh
    #- Rebuild the nib, mixedNib, maskedNib files:
    ./jkStuff/makeNib.sh
    # Copy the masked contig fa to /iscratch and /scratch:
    # (log in to kkr1u00 first; the iSync below is run from there)
    ssh kkr1u00
    rm -rf /iscratch/i/gs.15/build32/trfFa
    mkdir -p /iscratch/i/gs.15/build32/trfFa
    cp -p ~/hg14/?{,?}/NT_*/NT_??????.fa /iscratch/i/gs.15/build32/trfFa
    ~kent/bin/iSync
    ssh kkstore
    rm -rf /scratch/hg/gs.15/build32/trfFa
    mkdir -p /scratch/hg/gs.15/build32/trfFa
    cp -p ~/hg14/?{,?}/NT_*/NT_??????.fa /scratch/hg/gs.15/build32/trfFa


# PREPARE CLUSTER FOR BLASTZ RUN (DONE 03/17/03)
    # This needs to be done after trf-masking and nib generation.
    ssh eieio
    # Extract lineage-specific repeats using Arian Smit's script:
    mkdir -p ~/hg14/bed/linSpecRep
    cd ~/hg14/bed/linSpecRep
    foreach f (~/hg14/*/*.out)
        ln -sf $f .
    end
    /cluster/bin/scripts/primateSpecificRepeats.pl *.out
    /cluster/bin/scripts/perl-rename 's/(\.fa|\.nib)//' *.out.*spec
    /cluster/bin/scripts/perl-rename 's/\.(rod|prim)spec/.spec/' *.out.*spec
    rm *.out
    # Copy files to the kkstore:/scratch
    ssh kkstore
    # lineage-specific repeats:
    cd ~/hg14/bed
    mkdir -p /scratch/hg/gs.15/build32
    rm -rf /scratch/hg/gs.15/build32/linSpecRep
    cp -Rp linSpecRep /scratch/hg/gs.15/build32
    # RepeatMasker .out:
    cd ~/hg14
    rm -rf /scratch/hg/gs.15/build32/rmsk
    mkdir -p /scratch/hg/gs.15/build32/rmsk
    cp -p ?{,?}/chr?{,?}{,_random}.fa.out /scratch/hg/gs.15/build32/rmsk
    # Chrom-level mixed nibs that have been repeat- and trf-masked:
    rm -rf /scratch/hg/gs.15/build32/chromTrfMixedNib
    mkdir -p /scratch/hg/gs.15/build32/chromTrfMixedNib
    cp -p mixedNib/chr*.nib /scratch/hg/gs.15/build32/chromTrfMixedNib
    # Ask cluster-admin@cse.ucsc.edu to binrsync /scratch/hg to clusters
    # Copy to /iscratch as well so we can run blastz before binrsync finishes:
    rm -rf /iscratch/i/gs.15/build32/{linSpecRep,rmsk,chromTrfMixedNib}
    cp -Rp /scratch/hg/gs.15/build32/{linSpecRep,rmsk,chromTrfMixedNib} \
      /iscratch/i/gs.15/build32/
    ssh kkr1u00
    ~kent/bin/iSync

    # Jim's comments Feb 12 '03 about the order in which to run blastz:
    # In general we should do
    # 1) hg/mm
    # 2) mm/rn
    # 3) rn/hg
    # 4) hg/hg
    # 5) mm/mm
    # 6) rn/rn
    # There is now an 'axtSwap' program that might let us
    # get out of having to run the inverse of 1,2 & 3,  though
    # 2 in particular is so fast perhaps it's just as well to
    # do the inverse explicitly.


# MAKE DOWNLOADABLE SEQUENCE FILES (DONE 03/20/03)
    ssh eieio
    cd ~/hg14
    #- Build the .zip files
    ./jkStuff/zipAll.sh |& tee zipAll.log
    #- Look at zipAll.log to make sure all file lists look reasonable.  
    #- Check zip file integrity:
    mkdir zip
    mv *.zip* zip
    cd zip
    foreach f (*.zip)
      unzip -t $f > $f.test
      tail -1 $f.test
    end
    wc -l *.zip.test
    #- Copy the .zip files to hgwdev:/usr/local/apache/...
    ssh hgwdev
    cd ~/hg14/zip
    # Edit cpToWeb.sh to contain the correct destination path.
    ../jkStuff/cpToWeb.sh
    cd /usr/local/apache/htdocs/goldenPath/10mar2003
    #- Take a look at bigZips/* and chromosomes/*, update their README.txt's


# CREATING DATABASE  (DONE 03/16/03)
    ssh hgwdev
    # if you haven't already:
    ln -s /cluster/store5/gs.15/build32 ~/oo
    ln -s /cluster/store5/gs.15/build32 ~/hg14
    # Make sure there is at least 5 gig free on hgwdev:/var/lib/mysql
    df -h /var/lib/mysql
    # Create the database.
    echo 'create database hg14' | hgsql hg13
    # make a semi-permanent read-only alias (add this to your .cshrc/.bashrc):
        alias hg14 mysql -u hguser -phguserstuff -A hg14
    # Initialize the relational-mrna and external sequence info tables:
    hgLoadRna new hg14
    # Copy over grp table (for track grouping) from another database:
    echo "create table grp (PRIMARY KEY(NAME)) select * from hg13.grp" \
    | hgsql hg14


# SEQUENCE INFO: CHROMINFO (DONE 03/16/03)
    ssh eieio
    cd ~/hg14
    # Sanity-check */lift/ordered.lft length vs. agp length:
    foreach c ( ?{,?} )
      if (-e $c/lift/ordered.lst) then
        set lftLen = `tail -1 $c/lift/ordered.lft | awk '{print $5;}'`
        set agpLen = `tail -1 $c/chr$c.agp | awk '{print $3;}'`
        if ($lftLen != $agpLen) then
          echo "ERROR: chr$c : lftLen=$lftLen, agpLen=$agpLen"
        else
          echo "chr$c : $lftLen"
        endif
      endif
    end
    # Make chr*.fa from contig .fa
    tcsh jkStuff/chrFa.sh
    # Make unmasked nibs -- necessary for building chromInfo.
    mkdir nib
    foreach f (?{,?}/chr?{,?}{,_random}.fa)
      echo making unmasked nib for $f
      faToNib $f nib/$f:t:r.nib
    end
    # Make symbolic links from /gbdb/hg14/nib to the real nibs.
    ssh hgwdev
    mkdir -p /gbdb/hg14/nib
    foreach f (/cluster/store5/gs.15/build32/nib/chr*.nib)
      ln -s $f /gbdb/hg14/nib
    end
    # Load /gbdb/hg14/nib paths into database and save size info.
    hgsql hg14  < ~/src/hg/lib/chromInfo.sql
    cd ~/hg14
    hgNibSeq -preMadeNib hg14 /gbdb/hg14/nib ?{,?}/chr?{,?}{,_random}.fa
    echo "select chrom,size from chromInfo" | hgsql -N hg14 > chrom.sizes


# O+O: ASSEMBLY [GOLD], GAP, COVERAGE, MAP CONTIGS TRACKS (DONE 03/16/03)
    # Store o+o info in database.
    # Note: for build31, Terry specially requested these files from NCBI:
    # finished.finf
    # draft.finf
    # predraft.finf
    # extras.finf
    ssh eieio
    cd /cluster/store5/gs.15/build32
    # With contig_overlaps.agp available, build the chromosome-level .gl
    # files; otherwise load gold/gap only (-noGl) and leave a reminder.
    if (-f contig_overlaps.agp) then
      jkStuff/liftGl.sh contig.gl
    else
      hgGoldGapGl -noGl hg14 /cluster/store5/gs.15 build32 
      echo ""
      echo "*** Note from makeHg14.doc:"
      echo "Come back to this step later when we have contig_overlaps.agp\!"
    endif
    ssh hgwdev
    cd /cluster/store5/gs.15/build32
    # Full gold/gap/gl load plus clone positions; only possible once
    # contig_overlaps.agp exists.
    if (-f contig_overlaps.agp) then
      hgGoldGapGl hg14 /cluster/store5/gs.15 build32 
      cd /cluster/store5/gs.15
      hgClonePos hg14 build32 ffa/sequence.inf /cluster/store5/gs.15 -maxErr=3
    endif
    # Contig positions are loaded regardless of contig_overlaps.agp.
    cd /cluster/store5/gs.15
    hgCtgPos hg14 build32 


# LOAD REFGENE (DONE 03/16/03)
    # Do this after the database has been created and the RefSeq alignments 
    # are done (above)
    # Load refSeq alignments into database
    ssh hgwdev
    cd ~/hg14/bed/refSeq
    hgLoadPsl hg14 -tNameIx refSeqAli.psl
    # Make /gbdb symlinks for refSeq.fa (not .ra)
    mkdir -p /gbdb/hg14/mrna.134
    cd /gbdb/hg14/mrna.134
    ln -s /cluster/store5/mrna.134/refSeq/org/Homo_sapiens/refSeq.fa
    # Load the refSeq mRNA
    cd /cluster/store2/tmp
    hgLoadRna add -type=refSeq hg14 /gbdb/hg14/mrna.134/refSeq.fa \
      /cluster/store5/mrna.134/refSeq/org/Homo_sapiens/refSeq.ra
    cd ~/hg14/bed/refSeq
    hgRefSeqMrna hg14 /gbdb/hg14/mrna.134/refSeq.fa \
      /cluster/store5/mrna.134/refSeq/org/Homo_sapiens/refSeq.ra \
      all_refSeq.psl \
      /cluster/store5/mrna.134/refSeq/loc2ref \
      /cluster/store5/mrna.134/refSeq/hs.faa \
      /cluster/store5/mrna.134/refSeq/mim2loc
    # Don't worry about the "No gene name" errors
    # Add RefSeq status info
    hgRefSeqStatus -human hg14 /cluster/store5/mrna.134/refSeq/loc2ref
    # Create precomputed join of refFlat and refGene:
    echo 'CREATE TABLE refFlat \
          (KEY geneName (geneName), KEY name (name), KEY chrom (chrom)) \
          SELECT refLink.name as geneName, refGene.* \
          FROM refLink,refGene \
          WHERE refLink.mrnaAcc = refGene.name' \
    | hgsql hg14


# GC PERCENT (DONE 03/16/03)
     ssh hgwdev
     mkdir -p ~/hg14/bed/gcPercent
     cd ~/hg14/bed/gcPercent
     hgsql hg14  < ~/src/hg/lib/gcPercent.sql
     hgGcPercent hg14 ../../nib


# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 03/16/03)
    ssh hgwdev
    # Enter hg14 into hgcentraltest.dbDb so test browser knows about it:
    echo 'insert into dbDb values("hg14", "Human Mar. 2003", \
            "/gbdb/hg14/nib", "Human", "DUSP18", 1, 80, "Human");' \
    | hgsql -h genome-testdb hgcentraltest
    # Make trackDb table so browser knows what tracks to expect:
    cd ~/src/hg/makeDb/trackDb
    cvs up -d -P .
    # Edit that makefile to add hg14 in all the right places and do
    make update
    make alpha
    cvs commit makefile


# PRELOAD MRNA/EST SEQUENCE INFO INTO DATABASE (DONE 03/16/03)
    # Make /gbdb symlinks for sequence .fa (not .ra)
    mkdir -p /gbdb/hg14/mrna.134
    cd /gbdb/hg14/mrna.134
    ln -s /cluster/store5/mrna.134/org/Homo_sapiens/mrna.fa
    ln -s /cluster/store5/mrna.134/org/Homo_sapiens/est.fa
    ln -s /cluster/store5/mrna.134/humanXenoRna.fa
    ln -s /cluster/store5/mrna.134/humanXenoEst.fa
    # Store the sequence (non-alignment) info in database.
    cd /cluster/store2/tmp
    hgLoadRna add -type=mRNA hg14 /gbdb/hg14/mrna.134/mrna.fa \
      /cluster/store5/mrna.134/org/Homo_sapiens/mrna.ra
    hgLoadRna add -type=EST hg14 /gbdb/hg14/mrna.134/est.fa \
      /cluster/store5/mrna.134/org/Homo_sapiens/est.ra
    hgLoadRna add -type=xenoRna hg14 /gbdb/hg14/mrna.134/humanXenoRna.fa \
      /cluster/store5/mrna.134/humanXenoRna.ra
    hgLoadRna add -type=xenoEst hg14 /gbdb/hg14/mrna.134/humanXenoEst.fa \
      /cluster/store5/mrna.134/humanXenoEst.ra


# MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE 03/20/03)
    ssh hgwdev
    # Substitute BBB with the correct number for the hostname:
    echo 'insert into blatServers values("hg14", "blat11", "17778", "1"); \
          insert into blatServers values("hg14", "blat11", "17779", "0");' \
    | hgsql -h genome-testdb hgcentraltest


# MAKING AND STORING mRNA AND EST ALIGNMENTS (DONE 03/18/03)
    # Make sure that /scratch/hg/gs.15/build32/trfFa is loaded with NT_*.fa 
    # and has been pushed to the big cluster nodes.  (MASK SEQUENCE above)
    # Make sure mrna/est .fa's are under /iscratch/i too (GENBANK above)
    ssh kk
    mkdir -p ~/hg14/bed/{mrna,est}/psl
    cd ~/hg14/bed/mrna
    ls -1S /scratch/hg/gs.15/build32/trfFa/* > genome.lst
    ls -1S /iscratch/i/mrna.134/Homo_sapiens/mrna.fa > mrna.lst
    cp ~/hg13/bed/mrna/gsub .
    gensub2 genome.lst mrna.lst gsub spec
    para create spec
    para try
    cd ~/hg14/bed/est
    ls -1S /scratch/hg/gs.15/build32/trfFa/* > genome.lst
    ls -1S /iscratch/i/mrna.134/Homo_sapiens/est*.fa > mrna.lst
    # Using split est fa -- so create separate output dirs and special gsub:
    foreach f (`cat mrna.lst`)
      mkdir psl/$f:t:r
    end
    echo '#LOOP \
/cluster/home/kent/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -ooc={check in exists /scratch/hg/h/11.ooc} {check out line+ psl/$(root2)/$(root1)_$(root2).psl} \
#ENDLOOP' > gsub
    gensub2 genome.lst mrna.lst gsub spec
    para create spec
    para try
    # In each dir (~/hg14/bed/mrna, ~/hg14/bed/est):
    para check, para push, para check....
    # para time > time
      
    # Process mRNA and EST alignments into near best in genome.
    cd ~/hg14/bed/mrna
    pslSort dirs raw.psl /tmp psl
    pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl \
      /dev/null
    liftUp -nohead all_mrna.psl ../../jkStuff/liftAll.lft carry contig.psl
    pslSortAcc nohead chrom /tmp all_mrna.psl

    cd ~/hg14/bed/est
    pslSort dirs raw.psl /cluster/store2/tmp psl/est*
    pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl \
      /dev/null
    liftUp -nohead all_est.psl ../../jkStuff/liftAll.lft carry contig.psl
    pslSortAcc nohead chrom /cluster/store3/tmp all_est.psl

    # Load mRNA alignments into database. 
    ssh hgwdev
    cd ~/hg14/bed/mrna/chrom
    rm -f *_mrna.psl
    foreach i (*.psl)
      mv $i $i:r_mrna.psl
    end
    hgLoadPsl hg14 *.psl
    cd ..
    hgLoadPsl hg14 all_mrna.psl -nobin

    # Load EST alignments into database.
    ssh hgwdev
    cd ~/hg14/bed/est/chrom
    rm -f *_est.psl
    foreach i (*.psl)
      mv $i $i:r_est.psl
    end
    hgLoadPsl hg14 *.psl
    cd ..
    hgLoadPsl hg14 all_est.psl -nobin
    # Sequence info should have already been loaded into database (PRELOAD)


# SPLICED ESTS (INTRONEST) (DONE 03/18/03)
    # Create subset of ESTs with introns and load into database.
    ssh eieio
    cd ~/hg14
    tcsh jkStuff/makeIntronEst.sh
    ssh hgwdev
    cd ~/hg14/bed/est/intronEst
    hgLoadPsl hg14 *.psl


# ESTORIENTINFO, MRNAORIENTINFO, GENE BOUNDS (RNACLUSTER) (DONE 03/22/03)
    # Put orientation info on ESTs and mRNAs into database:
    ssh eieio
    cd ~/hg14/bed/est
    pslSortAcc nohead contig /cluster/store3/tmp contig.psl
    cd ~/hg14/bed/mrna
    pslSortAcc nohead contig /cluster/store3/tmp contig.psl
    # Distribute the est and mrna psl files to /iscratch/i
    ssh kkr1u00
    rm -rf /iscratch/i/gs.15/build32/bed
    mkdir -p /iscratch/i/gs.15/build32/bed
    cp -r ~/hg14/bed/est/contig /iscratch/i/gs.15/build32/bed/est
    cp -r ~/hg14/bed/mrna/contig /iscratch/i/gs.15/build32/bed/mrna
    ~kent/bin/iSync
    # mrna: use big cluster.
    ssh kk
    mkdir -p ~/hg14/bed/mrnaOrientInfo/oi
    cd ~/hg14/bed/mrnaOrientInfo
    ls -1S /iscratch/i/gs.15/build32/bed/mrna/*.psl > psl.lst
    ls -1S /iscratch/i/mrna.134/Homo_sapiens/mrna*.fa > mrna.lst
    cp ~/hg13/bed/mrnaOrientInfo/gsub .
    # Edit gsub to point to the correct paths.
    gensub2 psl.lst mrna.lst gsub spec
    para create spec
    para try 
    para check, para push, para check, ....
    # When the cluster run is done do:
    ssh hgwdev
    cd ~/hg14/bed/mrnaOrientInfo
    liftUp mrnaOrientInfo.bed ~/hg14/jkStuff/liftAll.lft warn oi/*.tab
    hgLoadBed hg14 mrnaOrientInfo mrnaOrientInfo.bed \
      -sqlTable=$HOME/kent/src/hg/lib/mrnaOrientInfo.sql > /dev/null

    # est: use small cluster (I/O intensive).  Use 2-level output dir 
    # (input est.fa has been split into multiple files).  
    ssh kkr1u00
    mkdir -p ~/hg14/bed/estOrientInfo/oi
    cd ~/hg14/bed/estOrientInfo
    ls -1S /iscratch/i/gs.15/build32/bed/est/*.psl > psl.lst
    # The split EST .fa files were distributed to /iscratch/i/mrna.134
    # (see the GENBANK section), so list them from there.
    ls -1S /iscratch/i/mrna.134/Homo_sapiens/est*.fa > mrna.lst
    # mrna.lst must exist before this loop: one output subdir per EST chunk.
    foreach f (`cat mrna.lst`)
      mkdir oi/$f:t:r
    end
    cp ~/hg13/bed/estOrientInfo/gsub .
    # Edit gsub to point to the correct paths.
    gensub2 psl.lst mrna.lst gsub spec
    para create spec
    para try 
    para check, para push, para check, ....
    # When the cluster run is done do:
    ssh hgwdev
    cd ~/hg14/bed/estOrientInfo
    # oi/*/*.tab -> argument list too long... so cat the lowest level together:
    foreach d (oi/*)
      cat $d/*.tab > $d.tab
    end
    liftUp estOrientInfo.bed ~/hg14/jkStuff/liftAll.lft warn oi/*.tab
    bedSort estOrientInfo.bed estOrientInfo.bed
    hgLoadBed hg14 estOrientInfo estOrientInfo.bed \
      -sqlTable=$HOME/kent/src/hg/lib/estOrientInfo.sql > /dev/null

    # Create rnaCluster table (depends on {est,mrna}OrientInfo above)
    cd ~/hg14
    # Create a list of accessions that come from RAGE libraries and need to
    # be excluded. (added by Chuck Wed Nov 27 13:09:07 PST 2002)
    ~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg14 \
      rage.libs
    mkdir -p ~/hg14/bed/rnaCluster/chrom
    # Exclude accessions in the RAGE file
    foreach f (?{,?}/chr*.fa)
      set c = $f:t:r
      set out = bed/rnaCluster/chrom/$c.bed
      echo clusterRna -mrnaExclude=hg14.rage.libs hg14 /dev/null $out -chrom=$c
      clusterRna -mrnaExclude=hg14.rage.libs hg14 /dev/null $out -chrom=$c
    end
    cd bed/rnaCluster
    hgLoadBed hg14 rnaCluster chrom/*.bed > /dev/null


# GENEBANDS (DONE 03/18/03)
    # Create precomputed geneBands table:
    ssh hgwdev
    hgGeneBands hg14 geneBands.txt
    hgsql hg14 < ~/kent/src/hg/lib/geneBands.sql
    echo "load data local infile 'geneBands.txt' into table geneBands;" \
    | hgsql hg14
    rm geneBands.txt


# PRODUCING GENSCAN PREDICTIONS (DONE 03/23/03)

    ssh eieio
    mkdir -p ~/hg14/bed/genscan
    cd ~/hg14/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir -p gtf pep subopt
    # Generate a list file, genome.list, of all the contigs
    # *that do not have pure Ns* (due to heterochromatin, unsequencable 
    # stuff) which would cause genscan to run forever.
    rm -f genome.list
    touch genome.list
    foreach f ( `ls -1S /cluster/store5/gs.15/build32/?{,?}/NT_*/NT_??????.fa.masked` )
      egrep '[ACGT]' $f > /dev/null
      if ($status == 0) echo $f >> genome.list
    end
        
    # Log into kkr1u00 (not kk!).  kkr1u00 is the driver node for the small
    # cluster (kkr2u00 - kkr8u00).  Genscan has problems running on the
    # big cluster, due to the limited memory and swap space on each
    # processing node.
    ssh kkr1u00
    # Create template file, gsub, for gensub2.  For example (3-line file):
#LOOP
/cluster/home/kent/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
    echo "" > dummy.list
    gensub2 genome.list dummy.list gsub jobList
    para create jobList
    para try
    para check
    para push
    # Issue either one of the following two commands to check the
    # status of the cluster and your jobs, until they are done.
    parasol status
    para check
    # If there were out-of-memory problems (run "para problems"), then 
    # re-run those jobs by hand but change the -window arg from 2400000
    # to 1200000.  In build32, this was 22/NT_011519.

    # Convert these to chromosome level files as so:     
    ssh eieio
    cd ~/hg14/bed/genscan
    liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/NT*.gtf
    liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/NT*.bed > \
      /dev/null
    cat pep/*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd ~/hg14/bed/genscan
    ldHgGene hg14 genscan genscan.gtf
    hgPepPred hg14 generic genscanPep genscan.pep
    hgLoadBed hg14 genscanSubopt genscanSubopt.bed > /dev/null


# CPGISLANDS (DONE 03/17/03)
    ssh eieio
    mkdir -p ~/hg14/bed/cpgIsland
    cd ~/hg14/bed/cpgIsland
    # Build software emailed from Asif Chinwalla (achinwal@watson.wustl.edu)
    # copy the tar file to the current directory
    tar xvf cpg_dist.tar 
    cd cpg_dist
    gcc readseq.c cpg_lh.c -o cpglh.exe
    cd ..
    # cpglh.exe requires hard-masked (N) .fa's.  
    # There may be warnings about "bad character" for IUPAC ambiguous 
    # characters like R, S, etc.  Ignore the warnings.  
    foreach f (../../?{,?}/chr?{,?}{,_random}.fa.masked)
      set fout=$f:t:r:r.cpg
      echo producing $fout...
      ./cpg_dist/cpglh.exe $f > $fout
    end
    cp ~/hg13/bed/cpgIsland/filter.awk .
    awk -f filter.awk chr*.cpg > cpgIsland.bed
    ssh hgwdev
    cd ~/hg14/bed/cpgIsland
    hgLoadBed hg14 cpgIsland -tab -noBin \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIsland.sql cpgIsland.bed


CREATE GOLDEN TRIANGLE (todo)

# Make sure that rnaCluster table is in place.   Then extract Affy 
# expression info into a form suitable for Eisen's clustering program with:
      cd ~/hg14/bed
      mkdir triangle
      cd triangle
      eisenInput hg14 affyHg10.txt
Transfer this to Windows and do k-means clustering
with k=200 with cluster.  Transfer results file back
to ~/hg14/bed/triangle/affyCluster_K_G200.kgg.  Then
do
      promoSeqFromCluster hg14 1000 affyCluster_K_G200.kgg kg200.unmasked
Then RepeatMask the .fa file in kg200.unmasked, and copy masked versions
to kg200.   Then
      cat kg200/*.fa > all1000.fa
and set up cluster Improbizer run to do 100 controls for every real
run on each - putting the output in imp.200.1000.e.  When improbizer
run is done make a file summarizing the runs as so:
      cd imp.200.1000.e
      motifSig ../imp.200.1000.e.iri ../kg200 motif control*
get rid of insignificant motifs with:
      cd ..
      awk '{if ($2 > $3) print; }' imp.200.1000.e.iri > sig.200.1000.e.iri
turn rest into just dnaMotifs with
      iriToDnaMotif sig.200.1000.e.iri motif.200.1000.e.txt
Extract all promoters with
      featureBits hg14 rnaCluster:upstream:1000 -bed=upstream1000.bed -fa=upstream1000.fa
Locate motifs on all promoters with
      dnaMotifFind motif.200.1000.e.txt upstream1000.fa hits.200.1000.e.txt -rc -markov=2
      liftPromoHits upstream1000.bed hits.200.1000.e.txt triangle.bed

CREATE STS/FISH/BACENDS/CYTOBANDS DIRECTORY STRUCTURE AND SETUP (DONE 3/15/2003)

o - Create directory structure to hold information for these tracks
        cd /projects/hg2/booch/psl/
        change Makefile parameters for OOVERS, GSVERS, PREVGS, PREVOO
        make new

o - Update all Makefiles with latest OOVERS and GSVERS, DATABASE, and locations of .fa files

o - Create accession_info file
        make accession_info.rdb

UPDATE STS INFORMATION (DONE 3/15/2003)

o - Download and unpack updated information from dbSTS:

        In a web browser, go to ftp://ftp.ncbi.nih.gov/repository/dbSTS/.  Download 
            dbSTS.sts, dbSTS.aliases, and dbSTS.FASTA.dailydump.Z to 
            /projects/hg2/booch/psl/update

        -Unpack dbSTS.FASTA.dailydump.Z
        gunzip dbSTS.FASTA.dailydump.Z

o - Create updated files
        cd /projects/hg2/booch/psl/update
        edit Makefile to latest sts.X version from PREV (currently sts.4)
        make update

o - Make new directory for this info and move files there
        ssh kks00
        mkdir /cluster/store1/sts.5
        cp all.STS.fa /cluster/store1/sts.5
        cp all.primers /cluster/store1/sts.5
        cp all.primers.fa /cluster/store1/sts.5

o - Copy new files to cluster
        ssh kkstore
        cd /cluster/store1/sts.5
        cp /cluster/store1/sts.5/*.* /scratch/hg/STS
        ask for propagation from sysadmin

STS ALIGNMENTS (DONE 3/19/2003)
(alignments done without RepeatMasking, so start ASAP!)

o - Create full sequence alignments
        ssh kk

        cd /cluster/home/booch/sts
        - update Makefile with latest OOVERS and GSVERS
        make new
        make jobList.scratch (if contig files propagated to nodes)
                - or _
        make jobList.disk (if contig files not propagated)
        para create jobList
        para push (or para try/para check if want to make sure it runs)
        make stsMarkers.psl

o - Copy files to final destination and remove originals
        ssh kks00
        make copy.assembly
        make clean.assembly

o - Create primer alignments
        ssh kk
        cd /cluster/home/booch/primers
        - update Makefile with latest OOVERS and GSVERS
        make new
        make jobList.scratch (if contig files propagated to nodes)
                - or _
        make jobList.disk (if contig files not propagated)
        para create jobList
        para push (or para try/para check if want to make sure it runs)
        make primers.psl

o - Copy files to final destination and remove
        ssh kks00
        make copy.assembly
        make clean.assembly
        
o - Create ePCR alignments
        ssh kk
        cd /cluster/home/booch/epcr
        - update Makefile with latest OOVERS and GSVERS
        make new
        make jobList.scratch (if contig files propagated to nodes)
                - or _
        make jobList.disk (if contig files not propagated)
        para create jobList
        para push (or para try/para check if want to make sure it runs)
        make primers.psl

o - Copy files to final destination and remove
        ssh kks00
        make copy.assembly
        make clean.assembly
        
CREATE AND LOAD STS MARKERS TRACK (DONE 3/19/2003)

o - Copy in current stsInfo2.bed and stsAlias.bed files
        cd /projects/hg2/booch/psl/gs.15/build32
        cp ../update/stsInfo2.bed .
        cp ../update/stsAlias.bed .

o - Create final version of sts sequence placements
        ssh kks00
        cd /projects/hg2/booch/psl/gs.15/build32/sts
        make stsMarkers.final

o - Create final version of primers placements
        cd /projects/hg2/booch/psl/gs.15/build32/primers
        cp /cluster/store1/sts.5/all.primers .
        make primers.final

o - Create bed file
        cd /projects/hg2/booch/psl/gs.15/build32
        make stsMap.bed

o - Create database tables
        ssh hgwdev
        cd /projects/hg2/booch/psl/tables
        hgsql hg14 < all_sts_primer.sql
        hgsql hg14 < all_sts_seq.sql
        hgsql hg14 < stsAlias.sql
        hgsql hg14 < stsInfo2.sql
        hgsql hg14 < stsMap.sql

o - Load the tables
        load /projects/hg2/booch/psl/gs.15/build32/sts/stsMarkers.psl.filter.lifted into all_sts_seq        
        load /projects/hg2/booch/psl/gs.15/build32/primers/primers.psl.filter.lifted into all_sts_primer        
        load /projects/hg2/booch/psl/gs.15/build32/stsAlias.bed into stsAlias
        load /projects/hg2/booch/psl/gs.15/build32/stsInfo2.bed into stsInfo2
      echo 'load data local infile "/projects/hg2/booch/psl/gs.15/build32/stsMap.bed" into table stsMap;' \
      | hgsql hg14

    # Load the sequences (change sts.# to match correct location)
    mkdir /gbdb/hg14/sts.6
    cd /gbdb/hg14/sts.6
    ln -s /cluster/store1/sts.6/all.STS.fa
    ln -s /cluster/store1/sts.6/all.primers.fa
    cd /cluster/store2/tmp
    hgLoadRna addSeq hg14 /gbdb/hg14/sts.6/all.STS.fa
    hgLoadRna addSeq hg14 /gbdb/hg14/sts.6/all.primers.fa


# UPDATE BACEND SEQUENCES (DONE 3/14/2003)

# 1) Download new files (not done cause no change for build32):
#    In a web browser, go to ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/BACENDS/.
#    Download BACends.fa.gz and cl_acc_gi_len_primer to 
#    /cluster/store1/bacends.3

# 2) Unpack BACends.fa.gz (the file downloaded in step 1; step 4 splits
#    the resulting BACends.fa)
        gunzip BACends.fa.gz

# 3) Create new pairs file
#    NOTE(review): the output path is under bacends.2 while the rest of this
#    section works in bacends.3 -- confirm which directory is intended.
	/cse/grads/booch/compbio/booch/scripts/convertBacEndPairInfo cl_acc_gi_len_primer /cluster/store1/bacends.2/bacEndPairs.txt

# 4) Split file into pieces
	/cluster/bin/i386/faSplit sequence BACends.fa 100 BACends

# 5) Move files to cluster
        ssh kkstore
        cd /cluster/store1/bacends.3
        mv /cluster/store1/bacends.3/BACends??.fa /scratch/hg/bacEnds/hs/

# 6) Ask for propagation from sysadmin


# BACEND SEQUENCE ALIGNMENTS (DONE 3/17/2003)
# (alignments done without RepeatMasking)

# 1) Create full sequence alignments
        ssh kk
        cd /cluster/home/booch/bacends
# update Makefile with latest OOVERS and GSVERS
        make new
        make jobList
        para create jobList
        para push 
        make bacEnds.psl

# 2) Lift the files (takes a while)
	make bacEnds.psl.lifted

# 3) Copy files to final destination and remove
        ssh kks00
        make copy
        make clean # (may want to wait until sure they're OK)

# BACEND PAIRS TRACK (DONE 3/18/2003)

   # 1) Update Makefile with OOVERS, GSVERS, location of pairs/singles 
   #    files, if necessary
        cd /projects/hg2/booch/psl/gs.15/build32/bacends
        # edit Makefile

   # 2) Create initial rdb file
	make bacEnds.rdb # (Takes a while)

   # 3) Create file of singles to search for
	make bacEndPairsBad.bed

   # 4) Try to fish out more pairs
	make bacEndsMiss.psl

   # 5) Re-make bacEnds.rdb with new info
	make bacEnds.rdb
 
   # 6) Create bacEndPairs track file
        make bacEndPairs.bed

   # 7) Create bacEndPairsBad and bacEndPairsLong files
	make bacEndPairsBad.bed

   # 8) Create psl file to load
	make bacEnds.load.psl

   # 9) Create database tables
        ssh hgwdev
        cd /projects/hg2/booch/psl/tables
        hgsql hg14 < all_bacends.sql
        hgsql hg14 < bacEndPairs.sql
        hgsql hg14 < bacEndPairsBad.sql
        hgsql hg14 < bacEndPairsLong.sql

   # 10) Load the tables
        load /projects/hg2/booch/psl/gs.15/build32/bacends/bacEnds.psl.filter.lifted into all_bacends        
        load /projects/hg2/booch/psl/gs.15/build32/bacends/bacEndPairs.bed into bacEndPairs
        load /projects/hg2/booch/psl/gs.15/build32/bacends/bacEndPairsBad.bed into bacEndPairsBad
        load /projects/hg2/booch/psl/gs.15/build32/bacends/bacEndPairsLong.bed into bacEndPairsLong

   # 11) Load the sequences (change bacends.# to match correct location)
   #     The database is hg14, as in every other step of this section.
       mkdir /gbdb/hg14/bacends.3
       cd /gbdb/hg14/bacends.3
       ln -s /cluster/store1/bacends.3/BACends.fa
       cd /cluster/store2/tmp
       hgLoadRna addSeq hg14 /gbdb/hg14/bacends.3/BACends.fa
                
FOSEND SEQUENCE ALIGNMENTS (DONE 3/17/2003)

o - Create full sequence alignments
        ssh kk
        cd /cluster/home/booch/fosends
        - update Makefile with latest OOVERS and GSVERS
        make new
        make jobList
        para create jobList
        para push (or para try/para check if want to make sure it runs)
        make fosEnds.psl

o - Copy files to final destination and remove
        ssh kks00
        make copy.assembly
        make clean.assembly

FOSEND PAIRS TRACK (TODO)

o - Update Makefile with location of pairs files, if necessary
        cd /projects/hg2/booch/psl/gs.15/build32/fosends

o - Create bed file
        ssh kks00
        cd /projects/hg2/booch/psl/gs.15/build32/fosends
        make fosEndPairs.bed

o - Create database tables
        ssh hgwdev
        cd /projects/hg2/booch/psl/tables
        hgsql hg14 < all_fosends.sql
        hgsql hg14 < fosEndPairs.sql

o - Load the tables
        load /projects/hg2/booch/psl/gs.15/build32/fosends/fosEnds.psl.filter.lifted into all_fosends        
        load /projects/hg2/booch/psl/gs.15/build32/fosends/fosEndPairs.bed into fosEndPairs

    # Load the sequences (change fosends.# to match correct location).
    # addSeq is given the /gbdb symlink path, as in the other addSeq steps
    # of this file, rather than the /cluster/store1 path behind it.
    mkdir /gbdb/hg14/fosends.1
    cd /gbdb/hg14/fosends.1
    ln -s /cluster/store1/fosends.1/fosEnds.fa
    cd /cluster/store2/tmp
    hgLoadRna addSeq hg14 /gbdb/hg14/fosends.1/fosEnds.fa
                
UPDATE FISH CLONES INFORMATION (TODO)

o - Download the latest info from NCBI
        point browser at http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg
        change "Show details on sequence-tag" to "yes"
        change "Download or Display" to "Download table for UNIX"
        press Submit - save as /projects/hg2/booch/psl/fish/hbrc/hbrc.YYYYMMDD.table

o - Format file just downloaded
        cd /projects/hg2/booch/psl/fish/
        make HBRC

o - Copy it to the new freeze location
        cp /projects/hg2/booch/psl/fish/all.fish.format /projects/hg2/booch/psl/gs.15/build32/fish/


CREATE AND LOAD FISH CLONES TRACK (DONE 3/20/2003)
(must be done after STS markers track and BAC end pairs track)

o - Extract the file with clone positions from database
        ssh hgwdev
        hgsql hg14
        mysql>  select * into outfile "/tmp/booch/clonePos.txt" from clonePos;
        mysql> quit
        mv /tmp/booch/clonePos.txt /projects/hg2/booch/psl/gs.15/build32/fish

o - Create bed file
        cd /projects/hg2/booch/psl/gs.15/build32/fish
        make bed

o - Create database table
        ssh hgwdev
        cd /projects/hg2/booch/psl/tables
        hgsql hg14 < fishClones.sql

o - Load the table
        load /projects/hg2/booch/psl/gs.15/build32/fish/fishClones.bed into fishClones
        

CREATE AND LOAD CHROMOSOME BANDS TRACK (DONE 3/20/2003)
(must be done after FISH Clones track) 

o - Create bed file
        ssh hgwdev
        make setBands.txt
        make cytobands.pct.ranges
        make predict

o - Create database table
        ssh hgwdev
        cd /projects/hg2/booch/psl/tables
        hgsql hg14 < cytoBand.sql
        
o - Load the table
      # The trailing backslash continues the pipeline onto the next line.
      echo "load data local infile '/projects/hg2/booch/psl/gs.15/build32/cytobands/cytobands.bed' into table cytoBand;" \
      | hgsql hg14


CREATE CHROMOSOME REPORTS (NOT BEING DONE)


CREATE STS MAP COMPARISON PLOTS AND GENETIC PLOTS (NOT BEING DONE)

o - Must wait until after the STS Map track has been finished

o - Create sts plots
        cd /projects/hg2/booch/psl/gs.15/build32/stsPlots
        make stsplots

o - Create genetic plots
        cd /projects/hg2/booch/psl/gs.15/build32/geneticPlots
        make all
        matlab -nodesktop
        >> allplot_ncbi('/cse/grads/booch/tracks/gs.15/build32/geneticPlots/','build32', 'jpg');
        >> quit

o - Set up directories where this will end up
        ssh hgwdev
        cd /usr/local/apache/htdocs/goldenPath/mapPlots
        update Makefile with OOVERS, GSVERS, and FREEZE date
        make new

o - Copy over files
        make sts
        make genetic

o - Update the index.html to include links to these new plots, and delete oldest set
    Update the arch.html with the oldest set just removed from index.html
        *** Make sure to check into CVS ***


# PRODUCING CROSS_SPECIES mRNA ALIGNMENTS (TODO)
    # Make sure masked contigs are in /scratch/hg/gs.15/build32/trfFa 
    # Make sure split-up xenoRna sequence is under /iscratch too (GENBANK)
    ssh kkstore
    mkdir -p ~/hg14/bed/xenoMrna
    cd ~/hg14/bed/xenoMrna
    mkdir psl
    ls -1S /scratch/hg/gs.15/build32/trfFa/*.fa.trf > human.lst
    ls -1S /iscratch/i/mrna.134/Homo_sapiens/xenoRna*.fa > mrna.lst
    # Using split fa -- so create separate output dirs and special gsub:
    foreach f (`cat mrna.lst`)
      mkdir psl/$f:t:r
    end
    echo '#LOOP \
/cluster/home/kent/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -q=rnax -t=dnax -mask=lower {check out line+ psl/$(root2)/$(root1)_$(root2).psl} \
#ENDLOOP' > gsub
    gensub2 human.lst mrna.lst gsub spec
    para create spec
    ssh kk
    cd ~/hg14/bed/xenoMrna
    para try
    para check
    para push 
    # Do para check until the run is done, doing para push if necessary
    # Sort xeno mRNA alignments as so:
    ssh eieio
    cd ~/hg14/bed/xenoMrna
    pslSort dirs raw.psl /cluster/store2/temp psl/xenoRna*
    pslReps raw.psl cooked.psl /dev/null -minAli=0.25
    liftUp chrom.psl ../../jkStuff/liftAll.lft warn cooked.psl
    pslSortAcc nohead chrom /cluster/store2/temp chrom.psl
    pslCat -dir chrom > xenoMrna.psl
    rm -r chrom raw.psl cooked.psl chrom.psl
    # Load into database as so:
    ssh hgwdev
    cd ~/hg14/bed/xenoMrna
    hgLoadPsl hg14 xenoMrna.psl -tNameIx
    # Sequence info should have already been loaded into database (PRELOAD)


# PRODUCING CROSS_SPECIES EST ALIGNMENTS (TODO)
    # Make sure masked contigs are in /scratch/hg/gs.15/build32/trfFa 
    # Make sure split-up xenoEst sequence is under /iscratch too (GENBANK)
    ssh kkstore
    mkdir -p ~/hg14/bed/xenoEst
    cd ~/hg14/bed/xenoEst
    mkdir psl
    ls -1S /scratch/hg/gs.15/build32/trfFa/*.fa.trf > human.lst
    ls -1S /iscratch/i/mrna.134/Homo_sapiens/xenoEst*.fa > mrna.lst
    # Using split fa -- so create separate output dirs and special gsub:
    foreach f (`cat mrna.lst`)
      mkdir psl/$f:t:r
    end
    echo '#LOOP \
/cluster/home/kent/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -q=dnax -t=dnax -mask=lower {check out line+ psl/$(root2)/$(root1)_$(root2).psl} \
#ENDLOOP' > gsub
    gensub2 human.lst mrna.lst gsub spec
    ssh kk
    cd ~/hg14/bed/xenoEst
    para create spec
    para try, para check, para push, para check, ...
    # Sort xenoEst alignments:
    ssh eieio
    cd ~/hg14/bed/xenoEst
    pslSort dirs raw.psl /cluster/store2/temp psl/xenoEst*
    pslReps raw.psl cooked.psl /dev/null -minAli=0.10
    liftUp chrom.psl ../../jkStuff/liftAll.lft warn cooked.psl
    pslSortAcc nohead chrom /cluster/store2/temp chrom.psl
    pslCat -dir chrom > xenoEst.psl
    rm -r chrom raw.psl cooked.psl chrom.psl
    # Load into database as so:
    ssh hgwdev
    cd ~/hg14/bed/xenoEst
    hgLoadPsl hg14 xenoEst.psl -tNameIx
    # Sequence info should have already been loaded into database (PRELOAD)


# PRODUCING FUGU ALIGNMENTS (TODO)
    # Distribute fugu sequence to /iscratch/i/fugu/ (if it isn't already there)
    ssh kkr1u00
    rm -rf /iscratch/i/fugu
    mkdir /iscratch/i/fugu
    cp -p /cluster/store3/fuguSeq/split2.5Mb/*.fa /iscratch/i/fugu
    ~kent/bin/iSync
DONE TO HERE
    ssh kk
    mkdir ~/hg14/bed/blatFugu
    cd ~/hg14/bed/blatFugu
    mkdir psl
    foreach f (~/hg14/?{,?}/NT_??????/NT_??????.fa)
      set c=$f:t:r
      mkdir -p psl/$c
    end
    ls -1S /iscratch/i/fugu/*.fa > fugu.lst
    ls -1S /scratch/hg/gs.15/build32/trfFa/*.fa.trf > human.lst
    # Copy the gsub template from the hg13 blatFugu directory.
    cp ~/hg13/bed/blatFugu/gsub .
    gensub2 human.lst fugu.lst gsub spec
    para create spec
    para try
    para check
    para push
    para check

    # When cluster run is done, sort alignments:
    ssh eieio
    cd ~/hg14/bed/blatFugu
    pslCat -dir psl/NT_??????.fa | \
      liftUp -type=.psl stdout ~/hg14/jkStuff/liftAll.lft warn stdin | \
      pslSortAcc nohead chrom temp stdin

    # Rename to correspond with tables as so and load into database:
    ssh hgwdev
    cd ~/hg14/bed/blatFugu/chrom
    rm -f chr*_blatFugu.psl
    foreach i (chr?{,?}{,_random}.psl)
        set r = $i:r
        mv $i ${r}_blatFugu.psl
    end
    hgLoadPsl hg14 *.psl
    # Make fugu /gbdb/ symlink and load Fugu sequence data.
    mkdir /gbdb/hg14/fuguSeq
    cd /gbdb/hg14/fuguSeq
    ln -s /cluster/store3/fuguSeq/fugu_v3_mask.fasta
    cd /cluster/store2/tmp
    hgLoadRna addSeq hg14 /gbdb/hg14/fuguSeq/fugu_v3_mask.fasta


TIGR GENE INDEX (TODO)
  o mkdir -p ~/hg14/bed/tigr    
    cd ~/hg14/bed/tigr  
    wget ftp://ftp.tigr.org/private/HGI_ren/TGI_track_HumanGenome_build32.tgz
    tar xvzf TGI*.tgz
    foreach f (*cattle*)
      set f1 = `echo $f | sed -e 's/cattle/cow/g'`
      mv $f $f1
    end
    # For each species, drop the first line of every matching file, rewrite
    # THC to TC, and prefix the first TC<digits> id on each line with the
    # species name (handed to perl through the environment variable O).
    # "tail -n +2" replaces the obsolete "tail +2" spelling, which not every
    # tail implementation accepts.
    foreach o (mouse cow human pig rat)
      setenv O $o
      foreach f (chr*_$o*s)
        tail -n +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
      end
    end
    ldHgGene -exon=TC hg14 tigrGeneIndex *.gff


LOAD MOUSEREF TRACK (todo)
    First copy in data from eieio to ~/hg14/bed/mouseRef.  
    Then substitute 'genome' for the appropriate chromosome 
    in each of the alignment files.  Finally do:
       hgRefAlign webb hg14 mouseRef *.alignments

LOAD AVID MOUSE TRACK (todo)
      ssh cc98
      cd ~/hg14/bed
      mkdir avidMouse
      cd avidMouse
      wget http://pipeline.lbl.gov/tableCS-LBNL.txt
      hgAvidShortBed *.txt avidRepeat.bed avidUnique.bed
      # hgLoadBed takes the database name before the table name.
      hgLoadBed hg14 avidRepeat avidRepeat.bed
      hgLoadBed hg14 avidUnique avidUnique.bed

LOAD SNPS (TODO)
      ssh hgwdev
      cd ~/hg14/bed
      mkdir snp
      cd snp
      mkdir build110
      cd build110
      ln -s ../../../seq_contig.md .
      ln -s ~/hg13/bed/cpgIsland/filter.awk .

     -Download SNPs from ftp://ftp.ncbi.nlm.nih.gov/pub/sherry/gp.ncbi.b31.gz
     -Unpack
      calcFlipSnpPos seq_contig.md gp.ncbi.b31 gp.ncbi.b31.flipped
      mv gp.ncbi.b31 gp.ncbi.b31.original
      gzip gp.ncbi.b31.original

      grep RANDOM       gp.ncbi.b31.flipped >  snpTsc.txt
      grep MIXED        gp.ncbi.b31.flipped >> snpTsc.txt
      grep BAC_OVERLAP  gp.ncbi.b31.flipped >  snpNih.txt
      grep OTHER        gp.ncbi.b31.flipped >> snpNih.txt

      awk -f filter.awk snpTsc.txt > snpTsc.contig.bed
      awk -f filter.awk snpNih.txt > snpNih.contig.bed

      # From bed/snp/build110 the build root is three levels up (the same
      # depth used for the seq_contig.md link), so jkStuff is ../../../jkStuff.
      liftUp snpTsc.bed ../../../jkStuff/liftAll.lft warn snpTsc.contig.bed
      liftUp snpNih.bed ../../../jkStuff/liftAll.lft warn snpNih.contig.bed

      hgLoadBed hg14 snpTsc snpTsc.bed
      hgLoadBed hg14 snpNih snpNih.bed

     -gzip all of the big files

LOAD ENSEMBL GENES (TODO)
     cd ~/hg14/bed
     mkdir ensembl
     cd ensembl

        Get the ensembl gene data as below:
        GET http://www.ebi.ac.uk/~stabenau/human_8_30.gtf.gz > ensGene.gz
        (The above may only be a temporary location)

        Get the ensembl protein data from http://www.ensembl.org/Homo_sapiens/martview
        Follow this sequence through the pages:
        Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
        Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
        Page 3) Choose the "Structures" box. 
        Page 4) Choose Transcripts/Proteins and GTF as the output, choose gzip compression and then hit export.

        gunzip the file and name to ensembl.gtf


# Ensembl handles random chromosomes differently than us, so we
# strip this data.  Fortunately it just loses a couple of genes.
     grep -v ^6_DR51 ensembl.gtf | grep -v _NT_ > unrandom.gtf

# Add "chr" to front of each line in the gene data gtf file to make 
# it compatible with ldHgGene 
     ~matt/bin/addchr.pl unrandom.gtf ensGene.gtf
     ./fixEns.pl ensGene.gtf ensFixed.gtf 
     # Load the file written by fixEns.pl; loading ensGene.gtf would leave
     # ensFixed.gtf unused.
     ldHgGene hg14 ensGene ensFixed.gtf

o - Load Ensembl peptides:
     Get them from ensembl as above in the gene section except for
     Page 3) Choose the "Sequences" box. 
     Page 4) Choose GTF as the output, choose gzip compression and then hit export.

     Substitute ENST for ENSP in ensPep with the program called subs
     edit subs.in to read: ENSP|ENST
     subs -e ensPep.fa > /dev/null

     Run fixPep.pl ensPep.fa ensembl.pep
     hgPepPred hg14 generic ensPep ensembl.pep


    LOAD SANGER 22 Pseudogenes 
    cd ~/hg14/bed/sanger22
    # File name matches the Chr22.3.lx.pseudogene.gff used in the next steps.
    cp ~/hg10/bed/sanger22/Chr22.3.lx.pseudogene.gff .
    replace ^chr22 with hg10:chr22 in Chr22.3.lx.pseudogene.gff
    liftUp -type=.gff pseudo.gff hg14.lft Chr22.3.lx.pseudogene.gff
    ldHgGene hg14 sanger22pseudo pseudo.gff

LOAD SANGER22 GENES  (TODO)
      cd ~/hg14/bed
      mkdir sanger22
      cd sanger22
      not sure where these files were downloaded from
      # The trailing backslash joins the two lines into a single pipeline.
      grep -v Pseudogene Chr22*.genes.gff | hgSanger22 hg14 stdin Chr22*.cds.gff *.genes.dna *.cds.pep 0 \
          | ldHgGene hg14 sanger22pseudo stdin
  # Note: this creates sanger22extras, but doesn't currently create
  # a correct sanger22 table, which are replaced in the next steps
      sanger22-gff-doctor Chr22.3.1x.cds.gff Chr22.3.1x.genes.gff \
          | ldHgGene hg14 sanger22 stdin
      sanger22-gff-doctor -pseudogenes Chr22.3.1x.cds.gff Chr22.3.1x.genes.gff \
          | ldHgGene hg14 sanger22pseudo stdin

      hgPepPred hg14 generic sanger22pep *.pep

              
              LOAD SANGER 20 GENES (todo)
     # First download files from James Gilbert's email to ~/hg14/bed/sanger20 and
     # go to that directory while logged onto hgwdev.  Then:
        grep -v Pseudogene chr_20*.gtf | ldHgGene hg14 sanger20 stdin
        hgSanger20 hg14 *.gtf *.info


# JAX ORTHOLOG (still valid???) (TODO)
    # Add Jackson labs info
    cd ~/hg14/bed
    mkdir jaxOrtholog
    cd jaxOrtholog
    wget ftp://ftp.informatics.jax.org/pub/informatics/reports/HMD_Human3.rpt
    cp /cluster/store1/gs.12/build29/bed/jaxOrtholog/filter.awk .
    awk -f filter.awk *.rpt > jaxOrtholog.tab
    # Drop (just in case), create and load the table like this:
    echo 'drop table jaxOrtholog;' | hgsql hg14
    hgsql hg14 < ~/src/hg/lib/jaxOrtholog.sql
    echo "load data local infile '"`pwd`"/jaxOrtholog.tab' into table \
          jaxOrtholog;" \
    | hgsql hg14


LOAD RNAGENES
      ssh hgwdev
      mkdir -p ~/hg14/bed/rnaGene
      cd ~/hg14/bed/rnaGene
      wget ftp://ftp.genetics.wustl.edu/pub/eddy/pickup/ncrna-hg14.gff.gz
      gunzip -c ncrna-hg14.gff.gz | grep -v '^#' > contig.gff
      liftUp chrom.gff ../../jkStuff/liftAll.lft warn contig.gff
      # NOTE(review): the table defined by rnaGene.sql is presumably rnaGene
      # (not hgRnaGene), so that is the table dropped before re-creating it.
      echo 'drop table rnaGene;' | hgsql hg14
      hgsql hg14 < ~/kent/src/hg/lib/rnaGene.sql
      hgRnaGenes hg14 chrom.gff

LOAD EXOFISH (todo)
     - login to hgwdev
     - cd /cluster/store5/gs.15/build32/bed
     - mkdir exoFish
     - cd exoFish
     - hg14 < ~kent/src/hg/lib/exoFish.sql
     - Put email attachment from Olivier Jaillon (ojaaillon@genoscope.cns.fr)
       into /cluster/store5/gs.15/build32/bed/exoFish/all_maping_ecore
     - awk -f filter.awk all_maping_ecore > exoFish.bed
     - hgLoadBed hg14 exoFish exoFish.bed

LOAD MOUSE SYNTENY (TODO)
     ssh hgwdev
     mkdir -p ~/hg14/bed/mouseSyn
     cd ~/hg14/bed/mouseSyn
     # Saved Michael Kamal's email attachment: allDirectedSegmentsBySize300.txt
     # Process the .txt file (minus header) into a bed 6 + file:
     grep -v "^#" allDirectedSegmentsBySize300.txt \
       | awk '($6 > $5) {printf "%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\n", $4, $5-1, $6, $1, 999, $7, $2-1, $3, $8;} \
              ($5 > $6) {printf "%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\n", $4, $6-1, $5, $1, 999, $7, $2-1, $3, $8;}' \
       > mouseSynWhd.bed
     hgLoadBed -noBin -sqlTable=$HOME/kent/src/hg/lib/mouseSynWhd.sql \
       hg14 mouseSynWhd mouseSynWhd.bed

LOAD GENIE (todo)
     - cat */ctg*/ctg*.affymetrix.gtf > predContigs.gtf
     - liftUp predChrom.gtf ../../jkStuff/liftAll.lft warn predContigs.gtf
     - ldHgGene hg14 genieAlt predChrom.gtf

     - cat */ctg*/ctg*.affymetrix.aa > pred.aa
     - hgPepPred hg14 genie pred.aa 

     - hg14
         # MySQL DELETE takes no column list: "delete from <table> where ...".
         mysql> delete from genieAlt where name like 'RS.%';
         mysql> delete from genieAlt where name like 'C.%';

LOAD SOFTBERRY GENES (TODO)
     mkdir -p ~/hg14/bed/softberry
     cd ~/hg14/bed/softberry
     wget ftp://www.softberry.com/pub/sc_fgenesh_hum_mar03up/sc_fgenesh_hum_mar03up.tar.gz
     gunzip -c sc_fgenesh_hum_mar03up.tar.gz | tar xvf -
     ldHgGene hg14 softberryGene chr*.gff
     hgPepPred hg14 softberry *.protein
     hgSoftberryHom hg14 *.protein

LOAD GENEID GENES (TODO)
    mkdir ~/hg14/bed/geneid
    cd ~/hg14/bed/geneid
    mkdir download
    cd download
    # Now download *.gtf and *.prot from 
    wget -r http://www1.imim.es/genepredictions/H.sapiens/golden_path_20021114/geneid_v1.1/
    # oops, due to links in the index.html, it tries to get too much.  
    # ctrl-c it when it starts to download other directories.
    mv www1.imim.es/genepredictions/H.sapiens/golden_path_20021114/geneid_v1.1/*.{gtf,prot} .
    rm -r www1.imim.es/
    # Add missing .1 to protein id's
    foreach f (*.prot)
      perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
    end
    cd ..
    ldHgGene hg14 geneid download/*.gtf -exon=CDS
    hgPepPred hg14 generic geneidPep download/*-fixed.prot

LOAD ACEMBLY (TODO)
    mkdir -p ~/hg14/bed/acembly
    cd ~/hg14/bed/acembly
    # Get acembly*gene.gff from Jean and Danielle Thierry-Mieg
    wget ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_31.human.genes/acembly.ncbi_31.genes.proteins.fasta.tar.gz
    wget ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_31.human.genes/acembly.ncbi_31.genes.gff.tar.gz
    gunzip -c acembly.ncbi_31.genes.gff.tar.gz | tar xvf -
    gunzip -c acembly.ncbi_31.genes.proteins.fasta.tar.gz | tar xvf -
    cd acembly.ncbi_31.genes.gff
    # Save just the floating-contig features to different files for lifting 
    # and lift up the floating-contig features to chr*_random coords:
    foreach f (acemblygenes.*.gff)
      set c=$f:r:e
      egrep '^[a-zA-Z0-9]+\|NT_[0-9][0-9][0-9][0-9][0-9][0-9]' $f | \
        perl -wpe 's/^(\w+)\|(\w+)/$1\/$2/' > ctg-chr${c}_random.gff
      if (-e ../../../$c/lift/random.lft) then
        liftUp chr${c}_random.gff ../../../$c/lift/random.lft warn \
          ctg-chr${c}_random.gff
      endif
      # Strip out _random or floating contig lines from the normal chrom gff,
      # and add the "chr" prefix:
      grep -v ^$c\| $f | grep -v ^Hs | perl -wpe 's/^/chr/;' > chr$c.gff
    end

    cd ../acembly.ncbi_31.genes.proteins.fasta
    #- Remove G_t*_ prefixes from acemblyproteins.*.fasta:
    foreach f (acemblyproteins.*.fasta)
      perl -wpe 's/^\>G_t[\da-zA-Z]+_/\>/' $f > chr$f:r:e.fa
    end

    #- Load into database:
    cd ..
    ldHgGene hg14 acembly acembly.ncbi_31.genes.gff/chr*.gff
    hgPepPred hg14 generic acemblyPep \
      acembly.ncbi_31.genes.proteins.fasta/chr*.fa

LOAD GENOMIC DUPES (todo)
o - Load genomic dupes
    ssh hgwdev
    cd ~/hg14/bed
    mkdir genomicDups
    cd genomicDups
    wget http://codon/jab/web/takeoff/hg1433_dups_for_kent.zip
    unzip *.zip
    awk -f filter.awk oo33_dups_for_kent > genomicDups.bed
    # Create the table with hgsql, as the rest of this file does, instead of
    # putting a mysql user name and password on the command line.
    hgsql hg14 < ~/src/hg/lib/genomicDups.sql
    # Load the file the awk step wrote (genomicDups.bed).
    hgLoadBed hg14 -oldTable genomicDups genomicDups.bed

LOAD NCI60 (TODO)
o - # ssh hgwdev
    cd /projects/cc/hg/mapplots/data/NCI60/dross_arrays_nci60/
    mkdir hg14
    cd hg14
    findStanAlignments hg14 ../BC2.txt.ns ../../image/cumulative_plates.011204.list.human hg14.image.psl >& hg14.image.log 
    cp ../experimentOrder.txt ./
    sed -e 's/ / \.\.\//g' < experimentOrder.txt > epo.txt
    stanToBedAndExpRecs  hg14.image.good.psl hg14.nci60.exp hg14.nci60.bed `cat epo.txt`
    hg14S -A < ../../scripts/nci60.sql
    echo "load data local infile 'hg14.nci60.bed' into table nci60" | hg14S -A
    mkdir /cluster/store5/gs.15/build32/bed/nci60
    mv hg14.nci60.bed /cluster/store5/gs.15/build32/bed/nci60
    rm *.psl 

LOAD AFFYRATIO [GNF] (TODO)
o - # ssh hgwdev
    cd /cluster/store1/sugnet/
    mkdir gs.15
    mkdir gs.15/build32
    mkdir gs.15/build32/affyGnf
    cd gs.15/build32/affyGnf
    cp /projects/compbiodata/microarray/affyGnf/sequences/HG-U95Av2_target ./
    ls -1 /cluster/store5/gs.15/build32/trfFa.1204/ > allctg.lst
    echo "/cluster/store1/sugnet/gs.15/build32/affyGnf/HG-U95Av2_target" > affy.lst
    echo '#LOOP\n/cluster/bin/i386/blat -mask=lower -minIdentity=95 -ooc=/cluster/store5/gs.15/build32/jkStuff/post.refCheck.old/11.ooc /cluster/store5/gs.15/build32/trfFa.1204/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
    gensub2 allctg.lst affy.lst template.sub para.spec
    # ssh kkr1u00
    para create para.spec
    para try
    para check
    para push
    # exit kkr1u00
    pslSort dirs hg14.affy.psl tmp psl >& pslSort.log
    liftUp hg14.affy.lifted.psl /cluster/store5/gs.15/build32/jkStuff/liftAll.lft warn hg14.affy.psl
    pslAffySelect seqIdent=.95 basePct=.95 in=hg14.affy.lifted.psl out=hg14.affy.pAffySelect.95.95.psl
    affyPslAndAtlasToBed hg14.affy.pAffySelect.95.95.psl  /projects/compbiodata/microarray/affyGnf/human_atlas_U95_gnf.noquotes.txt affyRatio.bed affyRatio.exr >& affyPslAndAtlasToBed.log 
    hg14S -A </projects/compbiodata/microarray/affyGnf/browserFiles/affyRatio.sql 
    echo "load data local infile 'affyRatio.bed' into table affyRatio" | hg14S -A
    mkdir /cluster/store5/gs.15/build32/bed/affyGnf
    rm -rf psl tmp err *.psl *.bed HG-U95Av2_target 

# LOAD SAGE DATA (TODO)
    # Builds the sage, sageExp and uniGene_2 tables from NCBI SAGE map data
    # plus the UniGene alignments made in "MAKE UNIGENE ALIGNMENTS" below.
    # $version must be set once and is used for every path in this section.
    ssh hgwdev
    cd ~/kent/src/hg/sage
    make
    # XXX = uniGene build for which SAGE was built -- not necessarily current!
    # Figure out the build number by peeking at this file:
    lynx ftp://ftp.ncbi.nih.gov/pub/sage/map/info.txt
    set version = XXX
    mkdir /projects/cc/hg/sugnet/sage/sage.$version
    cd /projects/cc/hg/sugnet/sage/sage.$version
    # The next two indented lines are typed at the ncftp prompt.
    ncftp ftp://ftp.ncbi.nih.gov/pub/sage
      mget -R map/readme.txt map/info.txt extr info map/Hs
      quit
    mkdir map
    mv Hs map
    cd map/Hs/NlaIII
    unzip -j SAGEmap_tag_ug-rel.zip
    cd ../../../extr/
    ../../scripts/summarizeCounts.pl expCounts.tab ./SAGE_*
    ../../scripts/countGenesPerTag.pl expCounts.tab allTags.count.tab
    ../../scripts/createArraysForTags.pl allTags.count.tab tagExpArrays.tab \
      ./SAGE_*
    ../../scripts/countsPerExp.pl expCounts.tab expList.tab
    cd ../map/Hs/NlaIII/
    # Reorder the tab-separated columns to (3rd, 4th, 1st), sort, and turn
    # spaces into underscores.
    perl -e 'while (<>) { \
               chomp($_); \
               @p = split(/\t/, $_); \
               print "$p[2]\t$p[3]\t$p[0]\n"\
             }' \
      < SAGEmap_tag_ug-rel | sort | sed -e 's/ /_/g' \
      > SAGEmap_ug_tag-rel_Hs
    # Back to the extr directory.
    cd -
    createSageSummary ../map/Hs/NlaIII/SAGEmap_ug_tag-rel_Hs \
      tagExpArrays.tab sageSummary.sage
    # Create the uniGene alignments
    # ~/hg14/bed/uniGene.$version/hg14.uniGene.lifted.pslReps.psl
    # -- see "MAKE UNIGENE ALIGNMENTS" below
    # Use $version here as well, so that setting it once above is enough
    # (this path used to contain a literal XXX).
    cd /projects/cc/hg/sugnet/sage/sage.$version/extr
    addAveMedScoreToPsls \
      ~/hg14/bed/uniGene.$version/hg14.uniGene.lifted.pslReps.psl \
      sageSummary.sage  uniGene.wscores.bed
    hgLoadBed hg14 uniGene_2 uniGene.wscores.bed
    hgsql hg14 < ~kent/src/hg/lib/sage.sql
    echo "load data local infile 'sageSummary.sage' into table sage" \
        | hgsql hg14
    cd ../info
    ../../scripts/parseRecords.pl ../extr/expList.tab  > sageExp.tab
    hgsql hg14 < ~/kent/src/hg/lib/sageExp.sql
    echo "load data local infile 'sageExp.tab' into table sageExp" | hgsql hg14
    # update ~/kent/src/hg/makeDb/trackDb/human/hg14/uniGene_2.html
    # with current uniGene date.

    
# MAKE UNIGENE ALIGNMENTS (TODO)
    # Aligns the UniGene sequences to every contig with blat on the cluster
    # and reduces the result to hg14.uniGene.lifted.pslReps.psl.
    # "set version" is repeated after each ssh because every hop starts a
    # new shell.
    # Download of the latest UniGene version is now automated by a 
    # cron job -- see /cluster/home/angie/crontab , 
    # /cluster/home/angie/unigeneVers/unigene.csh .  
    # If hgwdev gets rebooted, that needs to be restarted... maybe there's 
    # a more stable place to set up that cron job.  

    # substitute XXX -> the uniGene version used by SAGE, if building the 
    # uniGene/SAGE track;  or just the latest uniGene version in 
    # /projects/cc/hg/sugnet/uniGene/ , if doing uniGene alignments only.
    set version = XXX
    cd /projects/cc/hg/sugnet/uniGene/uniGene.$version
    gunzip Hs.seq.uniq.gz 
    # The two helper scripts live one directory up, shared by all versions.
    ../countSeqsInCluster.pl Hs.data counts.tab
    ../parseUnigene.pl Hs.seq.uniq Hs.seq.uniq.simpleHeader.fa leftoverData.tab
    # Distribute UniGene sequence to /iscratch/i/ (kkstore can see /projects)
    ssh kkstore
    set version = XXX # same as above
    mkdir -p /iscratch/i/uniGene.$version
    cp -p \
  /projects/cc/hg/sugnet/uniGene/uniGene.$version/Hs.seq.uniq.simpleHeader.fa \
      /iscratch/i/uniGene.$version
    # iSync is run from kkr1u00; presumably it propagates /iscratch/i to the
    # cluster nodes -- confirm.
    ssh kkr1u00
    ~kent/bin/iSync
    ssh kk
    set version = XXX # same as above
    mkdir -p ~/hg14/bed/uniGene.$version
    cd ~/hg14/bed/uniGene.$version
    # -S lists largest files first; both lists hold full paths, which the
    # template uses directly as $(path1) and $(path2).
    ls -1S /cluster/store5/gs.15/build32/trfFa/* > allctg.lst
    ls -1S /iscratch/i/uniGene.$version/Hs.seq.uniq.simpleHeader.fa \
      > uniGene.lst
    echo '#LOOP\n/cluster/bin/i386/blat -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
    gensub2 allctg.lst uniGene.lst template.sub para.spec
    para create para.spec
    # Output directory named in the template; must exist before jobs run.
    mkdir psl
    para try
    para check
    para push
    # ssh eieio
    set version = XXX # same as above
    cd ~/hg14/bed/uniGene.$version
    pslSort dirs raw.psl tmp psl >& pslSort.log
    # Lift contig coordinates with liftAll.lft and filter in one pipeline;
    # the second pslReps output is discarded to /dev/null.
    liftUp -type=.psl stdout ../../jkStuff/liftAll.lft warn raw.psl \
    | pslReps -minCover=0.2 -sizeMatters -minAli=0.98 -nearTop=0.002 \
      stdin hg14.uniGene.lifted.pslReps.psl /dev/null
    # use hg14.uniGene.lifted.pslReps.psl for building SAGE track (above).


LOADING MOUSE MM3 BLASTZ ALIGNMENTS FROM PENN STATE: (DONE 03/17/03)

    # Overview: the .lav pieces under $base/lav/<chrom>/ are merged into one
    # sorted axt per chromosome, converted to psl, and the psl files are
    # loaded into hg14.

    # Translate Penn State .lav files into sorted axt:
    ssh eieio
    set base="/cluster/store5/gs.15/build32/bed/blastz.mm3.2003-03-17-ASH"
    # seq1 = human nib directory, seq2 = mouse nib directory.
    set seq1_dir="/cluster/store5/gs.15/build32/mixedNib/"
    set seq2_dir="/cluster/store2/mm.2003.02/mm3/mixedNib/"
    set tbl="blastzMm3"
    cd $base
    mkdir -p axtChrom
    foreach c (lav/*)
      pushd $c
      # :t keeps the last path component, e.g. lav/chr1 -> chr1
      set chr=$c:t
      set out=$base/axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      # sort -g orders the piece file names numerically before they are
      # concatenated and streamed through lavToAxt and axtSort.
      cat `ls -1 *.lav | sort -g` \
        | lavToAxt stdin $seq1_dir $seq2_dir stdout \
        | axtSort stdin $out
      popd
    end

    # Translate the sorted axt files into psl:
    # S1.len and S2.len are read from $base; they are not created in this
    # section, so they must already be present.
    cd $base
    mkdir -p pslChrom
    foreach f (axtChrom/chr*.axt)
      # :t:r strips directory and extension, e.g. axtChrom/chr1.axt -> chr1
      set c=$f:t:r
      echo $c
      axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end

    # Load tables
    # base and tbl are set again because the ssh starts a new shell.
    ssh hgwdev
    set base="/cluster/store5/gs.15/build32/bed/blastz.mm3.2003-03-17-ASH"
    set tbl="blastzMm3"
    cd $base/pslChrom
    hgLoadPsl hg14 chr*_${tbl}.psl

MAKING THE BLASTZBESTMOUSE TRACK FROM PENN STATE MM3 AXT FILES (DONE 03/17/03)

    # Input: $base/axtChrom/<chrom>.axt from the loading step above.
    # Output: $base/axtBest/<chrom>.axt and pslBest/<chrom>_blastzBestMm3.psl.
    # Consolidate AXT files to chrom level, sort, pick best, make psl.
    ssh eieio
    set base="/cluster/store5/gs.15/build32/bed/blastz.mm3.2003-03-17-ASH"
    set tbl="blastzBestMm3"
    cd $base
    mkdir -p axtBest pslBest
    # The lav/chr* directory names are used only to enumerate chromosomes.
    foreach chrdir (lav/chr*)
      set chr=$chrdir:t
      echo axtBesting $chr
      axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300
      echo translating axtBest to psl for $chr
      axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
    end
    # Load tables
     ssh hgwdev
     set base="/cluster/store5/gs.15/build32/bed/blastz.mm3.2003-03-17-ASH"
     set tbl="blastzBestMm3"
     cd $base/pslBest
     hgLoadPsl hg14 chr*_${tbl}.psl

    # Make /gbdb links and add them to the axtInfo table:
# Not done for build 32:
     # Symlink every axtBest file into /gbdb so the paths stored in axtInfo
     # point at /gbdb rather than at the build directory.
     mkdir -p /gbdb/hg14/axtBestMm3
     cd /gbdb/hg14/axtBestMm3
     foreach f ($base/axtBest/chr*.axt)
       ln -s $f .
     end
     cd $base/axtBest
     # Start from an empty file, then append one INSERT per chromosome.
     rm -f axtInfoInserts.sql
     touch axtInfoInserts.sql
     foreach f (/gbdb/hg14/axtBestMm3/chr*.axt)
       set chr=$f:t:r
       echo "INSERT INTO axtInfo VALUES ('mm3','Blastz Best in Genome','$chr','$f');" \
         >> axtInfoInserts.sql
     end
     # axtInfo.sql is run before the inserts; presumably it (re)creates the
     # axtInfo table -- confirm before running this for a second species.
     hgsql hg14 < ~/kent/src/hg/lib/axtInfo.sql
     hgsql hg14 < axtInfoInserts.sql

MAKING THE AXTTIGHT FROM AXTBEST (DONE 03/17/03)
    # Mouse (mm3) version: reads axtBest/*.axt, writes axtTight/*.axt and
    # pslTight/<chrom>_blastzTightMm3.psl, then loads the psl files.
    # After creating axtBest alignments above, use subsetAxt to get axtTight:
    ssh eieio
    cd ~/hg14/bed/blastz.mm3.2003-03-17-ASH/axtBest
    mkdir -p ../axtTight
    foreach i (*.axt)
      echo $i
      # coding.mat and 3400 are the matrix and threshold arguments passed to
      # subsetAxt; output keeps the input file name.
      subsetAxt  $i ../axtTight/$i \
        ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    end
    # translate to psl
    cd ../axtTight
    mkdir -p ../pslTight
    foreach i (*.axt)
      # :r drops the .axt extension, e.g. chr1.axt -> chr1
      set c = $i:r
      axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightMm3.psl
    end
    # Load tables into database
    ssh hgwdev
    cd ~/hg14/bed/blastz.mm3.2003-03-17-ASH/pslTight
    hgLoadPsl hg14 chr*_blastzTightMm3.psl

BEGINNING OF RAT BLASTZ

LOADING RAT RN2 BLASTZ ALIGNMENTS FROM PENN STATE: (DONE 03/19/03)

    # Same procedure as the mouse section, with the rat (rn2) lav directory
    # and nib files: lav pieces -> sorted axt per chromosome -> psl -> load.

    # Translate Penn State .lav files into sorted axt:
    ssh eieio
    set base="/cluster/store5/gs.15/build32/bed/blastz.rn2.2003-03-18-ASH"
    # seq1 = human nib directory, seq2 = rat nib directory.
    set seq1_dir="/cluster/store5/gs.15/build32/mixedNib/"
    set seq2_dir="/cluster/store4/rn2/mixedNib/"
    set tbl="blastzRn2"
    cd $base
    mkdir -p axtChrom
    foreach c (lav/*)
      pushd $c
      # :t keeps the last path component, e.g. lav/chr1 -> chr1
      set chr=$c:t
      set out=$base/axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      # Concatenate the pieces in numeric file-name order, then convert and
      # sort in a single pipeline.
      cat `ls -1 *.lav | sort -g` \
        | lavToAxt stdin $seq1_dir $seq2_dir stdout \
        | axtSort stdin $out
      popd
    end

# STOPPED HERE -- big data, low demand.
    # Translate the sorted axt files into psl:
    # S1.len and S2.len are read from $base and must already exist there.
    cd $base
    mkdir -p pslChrom
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end
    # Load tables
    # base and tbl are set again because the ssh starts a new shell.
    ssh hgwdev
    set base="/cluster/store5/gs.15/build32/bed/blastz.rn2.2003-03-18-ASH"
    set tbl="blastzRn2"
    cd $base/pslChrom
    hgLoadPsl hg14 chr*_${tbl}.psl

MAKING THE BLASTZBESTRAT TRACK FROM PENN STATE RN2 AXT FILES (DONE 03/19/03)

    # Input: $base/axtChrom/<chrom>.axt from the rat loading step above.
    # Output: $base/axtBest/<chrom>.axt and pslBest/<chrom>_blastzBestRn2.psl.
    # Consolidate AXT files to chrom level, sort, pick best, make psl.
    ssh eieio
    set base="/cluster/store5/gs.15/build32/bed/blastz.rn2.2003-03-18-ASH"
    set tbl="blastzBestRn2"
    cd $base
    mkdir -p axtBest pslBest
    # The lav/chr* directory names are used only to enumerate chromosomes.
    foreach chrdir (lav/chr*)
      set chr=$chrdir:t
      echo axtBesting $chr
      axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300
      echo translating axtBest to psl for $chr
      axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
    end

    # Load tables
     ssh hgwdev
     set base="/cluster/store5/gs.15/build32/bed/blastz.rn2.2003-03-18-ASH"
     set tbl="blastzBestRn2"
     cd $base/pslBest
     hgLoadPsl hg14 chr*_${tbl}.psl

    # Make /gbdb links and add them to the axtInfo table:
# Not done for build 32:
     # Symlink every axtBest file into /gbdb so the paths stored in axtInfo
     # point at /gbdb rather than at the build directory.
     mkdir -p /gbdb/hg14/axtBestRn2
     cd /gbdb/hg14/axtBestRn2
     foreach f ($base/axtBest/chr*.axt)
       ln -s $f .
     end
     cd $base/axtBest
     # Start from an empty file, then append one INSERT per chromosome.
     rm -f axtInfoInserts.sql
     touch axtInfoInserts.sql
     foreach f (/gbdb/hg14/axtBestRn2/chr*.axt)
       set chr=$f:t:r
       echo "INSERT INTO axtInfo VALUES ('rn2','Blastz Best in Genome','$chr','$f');" \
         >> axtInfoInserts.sql
     end
     # NOTE(review): axtInfo.sql is also run in the mouse and human sections;
     # if it recreates the table, rows from the other species would be lost
     # -- confirm before running.
     hgsql hg14 < ~/kent/src/hg/lib/axtInfo.sql
     hgsql hg14 < axtInfoInserts.sql

MAKING THE AXTTIGHT FROM AXTBEST (DONE 03/19/03)
    # Rat (rn2) version: reads axtBest/*.axt, writes axtTight/*.axt and
    # pslTight/<chrom>_blastzTightRn2.psl, then loads the psl files.
    # After creating axtBest alignments above, use subsetAxt to get axtTight:
    ssh eieio
    cd ~/hg14/bed/blastz.rn2.2003-03-18-ASH/axtBest
    mkdir -p ../axtTight
    foreach i (*.axt)
      # Same matrix file and threshold as the mouse axtTight step.
      subsetAxt  $i ../axtTight/$i \
        ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    end
    # translate to psl
    cd ../axtTight
    mkdir -p ../pslTight
    foreach i (*.axt)
      # :r drops the .axt extension, e.g. chr1.axt -> chr1
      set c = $i:r
      axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightRn2.psl
    end
    # Load tables into database
    ssh hgwdev
    cd ~/hg14/bed/blastz.rn2.2003-03-18-ASH/pslTight
    hgLoadPsl hg14 chr*_blastzTightRn2.psl

XXX END OF RAT BLASTZ

BEGINNING OF HUMAN BLASTZ

LOADING HUMAN HG14 (SELF) BLASTZ ALIGNMENTS: (DONE 03/19/03)

    # Translate Penn State .lav files into sorted axt, with alignments 
    # to self/diagonal dropped:
    ssh eieio
    set base="/cluster/store5/gs.15/build32/bed/blastz.hg14.2003-03-18-ASH"
    # Self alignment: both sequence directories are the human nib directory.
    set seq1_dir="/cluster/store5/gs.15/build32/mixedNib/"
    set seq2_dir="/cluster/store5/gs.15/build32/mixedNib/"
    set tbl="blastzHuman"
    cd $base
    mkdir -p axtChrom
    # sometimes alignments are so huge that they cause axtSort to run out 
    # of memory.  Run them in two passes like this:
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=$base/axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      # Pass 1: convert, drop self alignments and sort each lav piece on its
      # own, leaving <piece>.lav.axt next to the lav file.
      foreach d (*.lav)
        set smallout=$d.axt
        lavToAxt $d $seq1_dir $seq2_dir stdout \
        | axtDropSelf stdin stdout \
        | axtSort stdin $smallout
      end
      # Pass 2: concatenate the per-piece files in numeric file-name order.
      # The per-piece .lav.axt files are kept; the blastzBestHuman step
      # reads them from the lav directories.
      cat `ls -1 *.lav.axt | sort -g` \
        > $out
      popd
    end

# STOPPED HERE -- big data, low demand.
    # Translate the sorted axt files into psl:
    # S1.len and S2.len are read from $base and must already exist there.
    cd $base
    mkdir -p pslChrom
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo translating $c.axt to psl
      axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end

    # Load tables
    # base and tbl are set again because the ssh starts a new shell.
    ssh hgwdev
    set base="/cluster/store5/gs.15/build32/bed/blastz.hg14.2003-03-18-ASH"
    set tbl="blastzHuman"
    cd $base/pslChrom
    hgLoadPsl hg14 chr*_${tbl}.psl

MAKING THE BLASTZBESTHUMAN TRACK FROM UNFILTERED AXT FILES (DONE 03/20/03)

    # Input: the per-piece .axt files left in lav/<chrom>/ by the loading
    # step.  Output: axtBest/<chrom>.axt and
    # pslBest/<chrom>_blastzBestHuman.psl.
    # Consolidate AXT files to chrom level, sort, pick best, make psl.
    ssh eieio
    set base="/cluster/store5/gs.15/build32/bed/blastz.hg14.2003-03-18-ASH"
    set tbl="blastzBestHuman"
    cd $base
    mkdir -p axtBest pslBest
    # run axtBest in 2 passes to reduce size of the input to final axtBest:
    foreach chrdir (lav/*)
      set chr=$chrdir:t
      echo two-pass axtBesting $chr
      # Pass 1: axtBest on each piece; $a:r drops the trailing .axt, so
      # X.lav.axt produces X.lav.axtBest.
      foreach a ($chrdir/*.axt)
        axtBest $a $chr $a:r.axtBest
      end
      # Pass 2: concatenate the per-piece results in numeric file-name order
      # and run axtBest once more over the combined file.
      cat `ls -1 $chrdir/*.axtBest | sort -g` \
        > $chrdir/$chr.axtBestPieces
      axtBest $chrdir/$chr.axtBestPieces $chr axtBest/$chr.axt
      axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
    end

    # Load tables
     ssh hgwdev
     set base="/cluster/store5/gs.15/build32/bed/blastz.hg14.2003-03-18-ASH"
     set tbl="blastzBestHuman"
     cd $base/pslBest
     hgLoadPsl hg14 chr*_${tbl}.psl

    # Make /gbdb links and add them to the axtInfo table:
# Not done for build 32:
     # Symlink every axtBest file into /gbdb so the paths stored in axtInfo
     # point at /gbdb rather than at the build directory.
     mkdir -p /gbdb/hg14/axtBestHg14
     cd /gbdb/hg14/axtBestHg14
     foreach f ($base/axtBest/chr*.axt)
       ln -s $f .
     end
     cd $base/axtBest
     # Start from an empty file, then append one INSERT per chromosome.
     rm -f axtInfoInserts.sql
     touch axtInfoInserts.sql
     foreach f (/gbdb/hg14/axtBestHg14/chr*.axt)
       set chr=$f:t:r
       echo "INSERT INTO axtInfo VALUES ('hg14','Blastz Best Human Self','$chr','$f');" \
         >> axtInfoInserts.sql
     end
     # NOTE(review): axtInfo.sql is also run in the mouse and rat sections;
     # if it recreates the table, rows from the other species would be lost
     # -- confirm before running.
     hgsql hg14 < ~/kent/src/hg/lib/axtInfo.sql
     hgsql hg14 < axtInfoInserts.sql

MAKING THE AXTTIGHT FROM AXTBEST (DONE 03/20/03)
    # Human self-alignment version: reads axtBest/*.axt, writes
    # axtTight/*.axt and pslTight/<chrom>_blastzTightHuman.psl, then loads
    # the psl files.
    # After creating axtBest alignments above, use subsetAxt to get axtTight:
    ssh eieio
    cd ~/hg14/bed/blastz.hg14.2003-03-18-ASH/axtBest
    mkdir -p ../axtTight
    foreach i (*.axt)
      # Same matrix file and threshold as the mouse and rat axtTight steps.
      subsetAxt  $i ../axtTight/$i \
        ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    end
    # translate to psl
    cd ../axtTight
    mkdir -p ../pslTight
    foreach i (*.axt)
      # :r drops the .axt extension, e.g. chr1.axt -> chr1
      set c = $i:r
      axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightHuman.psl
    end
    # Load tables into database
    ssh hgwdev
    cd ~/hg14/bed/blastz.hg14.2003-03-18-ASH/pslTight
    hgLoadPsl hg14 chr*_blastzTightHuman.psl


XXX END OF HUMAN BLASTZ


LIFTING REPEATMASKER .ALIGN FILES (TODO)

# NOTE(review): no cd precedes these loops; the relative globs presumably
# expect the build directory holding the per-chromosome dirs -- confirm.
# Step 1: in every contig directory, run liftRMAlign.pl on the contig's
# lift file and write <contig>.fa.align.
# The glob ?{,?} matches one- or two-character directory names.
foreach d (?{,?}/NT_??????)
  # :t keeps the contig name, e.g. 1/NT_004321 -> NT_004321
  set c=$d:t
  cd $d
  echo $c to $c.fa.align
  /cluster/bin/scripts/liftRMAlign.pl $c.lft > $c.fa.align
  cd ../..
end

# Step 2: per chromosome, lift to chrN.fa.align (and chrN_random.fa.align
# when a random.lft exists).
foreach chr (?{,?})
  cd $chr
  echo making symbolic links for chr$chr NT .fa.align files
  # ln -s with a single argument creates the link in the current directory
  # under the same file name; presumably liftRMAlign.pl looks for the contig
  # .fa.align files there -- confirm.
  foreach ctg (NT_??????)
    ln -s $ctg/$ctg.fa.align
  end
  cd ..
  if (-e $chr/lift/ordered.lft) then
    echo making $chr/chr$chr.fa.align
    /cluster/bin/scripts/liftRMAlign.pl $chr/lift/ordered.lft \
      > $chr/chr$chr.fa.align
  endif
  if (-e $chr/lift/random.lft) then
    echo making $chr/chr${chr}_random.fa.align
    /cluster/bin/scripts/liftRMAlign.pl $chr/lift/random.lft \
      > $chr/chr${chr}_random.fa.align
  endif
  echo removing symbolic links for chr$chr NT .fa.align files
  # Removes only the temporary links made above, not the files inside the
  # contig directories.
  rm $chr/NT_??????.fa.align
end


TWINSCAN GENE PREDICTIONS (TODO)
    # Download the per-chromosome Twinscan GTF and protein files, normalise
    # their ids, then load genes (twinscan) and proteins (twinscanPep).
    # Create the directory that is entered on the next line (the mkdir used
    # to name a different directory, "twinscanchr_gtf", so the cd failed on
    # a fresh build).
    mkdir -p ~/hg14/bed/twinscan
    cd ~/hg14/bed/twinscan
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
      # Remove earlier downloads so wget does not save under a .1 suffix.
      rm -f chr$c.{gtf,ptx}
      # NOTE(review): these URLs point at an NCBI31 prediction set -- confirm
      # the location of the build 32 files before running.
      wget http://genome.cs.wustl.edu/~bio/human/NCBI31/12-30-02/chr_gtf/chr$c.gtf
      wget http://genome.cs.wustl.edu/~bio/human/NCBI31/12-30-02/chr_ptx/chr$c.ptx
      # clean up chrom name and put chrom in transcript_id:
      perl -wpe 's/^chr(\w+)\.\d+\.\d+(.*)transcript_id "(\d+\.\d+).a"/chr$1$2transcript_id "$1.$3.a"/' \
        < chr$c.gtf > chr$c-fixed.gtf
      # pare down protein FASTA header to id and add missing .a:
      perl -wpe 's/^\>.*\s+source_id\s*\=\s*(\S+)\s+chr=(\w+).*$/\>$2.$1.a/;' \
        < chr$c.ptx > chr$c-fixed.fa
    end
    ldHgGene hg14 twinscan chr*-fixed.gtf -exon=CDS
    hgPepPred hg14 generic twinscanPep chr*-fixed.fa


# LOAD CHIMP DATA (TODO)
    # Aligns the chimp sequence to the human contigs with a cluster blat
    # run, lifts and sorts the result, and loads it as blatChimp.
    # Download the chimp sequence and distribute to /iscratch/i
    ssh hgwdev
    mkdir /cluster/store1/chimpSeq
    cd /cluster/store1/chimpSeq
    wget http://www.cs.uni-duesseldorf.de/~ebersber/annotation_track_chimp/downloads/mpi-aligned_seqparts_jun02.fa.gz
    gunzip *.gz
    ssh kkr1u00
    mkdir /iscratch/i/chimp
    cp -p /cluster/store1/chimpSeq/*.fa /iscratch/i/chimp/
    # Make sure it unpacked OK
    # iSync is run from kkr1u00; presumably it propagates /iscratch/i to the
    # cluster nodes -- confirm.
    ~kent/bin/iSync
    ssh kk
    mkdir ~/hg14/bed/blatChimp
    cd ~/hg14/bed/blatChimp
    # Reuse the job template from the hg13 build.
    cp ~/hg13/bed/blatChimp/gsub .
    # -S lists largest files first.
    ls -1S /iscratch/i/chimp/*.fa > chimp.lst
    ls -1S /scratch/hg/gs.15/build32/trfFa.1204/*.fa.trf > human.lst
    # NOTE(review): the output directory must match what the copied gsub
    # template writes to -- confirm it is psl/.
    mkdir psl
    gensub2 human.lst chimp.lst gsub spec
    para create spec
    para try
    para push
    para check

    # Sort alignments as so
    ssh eieio
    cd ~/hg14/bed/blatChimp
    # Concatenate all psl output, lift contig coordinates with liftAll.lft,
    # and let pslSortAcc write its output under chrom/ using temp/ as
    # scratch space.
    pslCat -dir psl \
    | liftUp -type=.psl stdout ~/hg14/jkStuff/liftAll.lft warn stdin \
    | pslSortAcc nohead chrom temp stdin
    pslCat -dir chrom > blatChimp.psl

    ssh hgwdev
    cd ~/hg14/bed/blatChimp
    hgLoadPsl hg14 blatChimp.psl         


SGP GENE PREDICTIONS (TODO)
    # Download SGP gene (.gtf) and protein (.prot) predictions for every
    # chromosome fasta present in the build, fix the protein ids, and load
    # them as sgpGene / sgpPep.
    mkdir -p ~/hg14/bed/sgp/download
    cd ~/hg14/bed/sgp/download
    # The glob enumerates chrN.fa and chrN_random.fa in the one- or
    # two-character chromosome directories; :t:r reduces each path to the
    # bare chromosome name.
    foreach f (~/hg14/?{,?}/chr?{,?}{,_random}.fa)
      set chr = $f:t:r
      wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_20021114/SGP/$chr.gtf
      wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_20021114/SGP/$chr.prot
    end
    # The remote chrUn files are saved under the local name chrUn_random.
    wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_20021114/SGP/chrUn.gtf -O chrUn_random.gtf
    wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_20021114/SGP/chrUn.prot -O chrUn_random.prot
    # Add missing .1 to protein id's
    # Output goes to <name>-fixed.prot; only those files are loaded below.
    foreach f (*.prot)
      perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
    end
    cd ..
    ldHgGene hg14 sgpGene download/*.gtf -exon=CDS
    hgPepPred hg14 generic sgpPep download/*-fixed.prot

ALIGNED ANCIENT REPEATS FROM MOUSE BLASTZ (TODO)
    # Converts each gzipped .aar file of the mouse blastz run to psl
    # (aar -> axt -> psl) and loads all resulting psl files.
    ssh eieio
    mkdir -p ~/hg14/bed/aarMm2
    cd ~/hg14/bed/aarMm2
    # NOTE(review): this points at an mm2 blastz directory, while the blastz
    # sections of this build use mm3 -- confirm the directory exists.
    set mmdir=../blastz.mm2.2002-12-5-ASH
    foreach aar ($mmdir/aar/*.aar.gz)
      # :t:r:r strips the directory and both extensions (.gz, then .aar).
      set c = $aar:t:r:r
      echo translating chr$c aar to axt
      zcat $aar \
      | $HOME/kent/src/hg/makeDb/hgAar/aarToAxt \
      | axtToPsl stdin $mmdir/S1.len $mmdir/S2.len stdout \
      > chr${c}_aarMm2.psl
    end   
    ssh hgwdev
    cd ~/hg14/bed/aarMm2
    hgLoadPsl hg14 *.psl

ALIGNMENT COUNTS FOR WIGGLE TRACK
    # this needs to be updated to reflect the full process.
 
    - Generate BED table of AARs used to select regions.
        cat ../bed/aarMm2/*.psl | awk 'BEGIN{OFS="\t"} {print $14,$16,$17,"aar"}' >aarMm2.bed
    
    - Generate background counts with windows that have 6kb counts,
      with a maximum window size of 512kb and sliding the windows by 1kb
        foreach axt (../../blastz.mm2.2002-08-01/axtBest/chr*.axt)
           set chr=$axt:t:r
           set tab=$chr.6kb-aar.cnts  (??? need better name ???)
           hgCountAlign -selectBed=aarMm2.bed -winSize=512000 -winSlide=1000 -fixedNumCounts=6000 -countCoords $axt $tab
        end

    - Generate counts for AARs with 50b windows, slide by 5b
        foreach axt (../../blastz.mm2.2002-08-01/axtBest/chr*.axt)
           set chr=$axt:t:r
           set tab=$chr.50b-aar.cnts  (??? need better name ???)
           hgCountAlign -selectBed=aarMm2.bed -winSize=50 -winSlide=5 $axt $tab
        end

    - Generate counts for all with 50b windows, slide by 5b
        foreach axt (../../blastz.mm2.2002-08-01/axtBest/chr*.axt)
           set chr=$axt:t:r
           set tab=$chr.50b.cnts  (??? need better name ???)
           hgCountAlign -winSize=50 -winSlide=5 $axt $tab
        end


REFFULL (TODO)

o ssh to eieio
 mkdir -p /cluster/store5/gs.15/build32/bed/refFull
 cd /cluster/store5/gs.15/build32/bed/refFull

 Download the sequence: wget ftp://blue3.ims.u-tokyo.ac.jp/pub/db/hgc/dbtss/ref-full.fa.gz

 gunzip it and split the ref-full.fa file into about 200 pieces
 # Unpack the download and split it into pieces named splitRefFull*.fa
 # (faSplit is given a piece count of 50).
 gunzip ref-full.fa.gz
 faSplit sequence ref-full.fa 50 splitRefFull
 ssh kkstore
 cd /cluster/store5/gs.15/build32/bed/refFull
 mkdir /scratch/hg/refFull
 # Stage the pieces on /scratch for the cluster run (the copy command was
 # missing from this line, leaving only its arguments).
 cp splitRefFull* /scratch/hg/refFull/
 # Job lists for gensub2: genome contigs and refFull pieces, largest first.
 ls -1S /scratch/hg/gs.15/build32/contig.0729/*.fa > genome.lst
 ls -1S /scratch/hg/refFull/split*.fa > refFull.lst

o - Request the admins to do a binrsync to the cluster of /scratch/hg/refFull

o - Use BLAT to generate refFull alignments as so:
      Make sure that /scratch/hg/gs.15/build32/contig/ is loaded
      with NT_*.fa and pushed to the cluster nodes.

          ssh kk
          cd /cluster/store5/gs.15/build32/bed/refFull
          mkdir -p psl
#          run mkdirs.sh script to create subdirs in the psl directory
#               in order to modularize the blat job.
          gensub2 genome.lst refFull.lst gsub spec
          para create spec
          
    Now run a para try/push and para check in each one.

o - Process refFull alignments into near best in genome.
      # Sort the raw blat output from the psl/* subdirectories, filter it
      # with pslReps (same thresholds as the uniGene alignments), lift the
      # contig coordinates with liftAll.lft, then run pslSortAcc so that the
      # result ends up under chrom/.
      cd ~/hg14/bed
      cd refFull
      pslSort dirs raw.psl /tmp psl/*
      # The second pslReps output is discarded to /dev/null.
      pslReps -minCover=0.2 -sizeMatters -minAli=0.98 -nearTop=0.002 raw.psl contig.psl /dev/null
      liftUp -nohead all_refFull.psl ../../jkStuff/liftAll.lft carry contig.psl
      pslSortAcc nohead chrom /tmp all_refFull.psl

o - Load refFull alignments into database
      ssh hgwdev
      cd /cluster/store5/gs.15/build32/bed/refFull
      # Merge the files under chrom/ into one psl; hgLoadPsl presumably
      # takes the table name (refFullAli) from the file name -- confirm.
      pslCat -dir chrom > refFullAli.psl
      hgLoadPsl hg14 -tNameIx refFullAli.psl


MAKING PROMOTER FILES
    # Writes fasta for the 1000, 2000 and 5000 base regions upstream of
    # refGene entries, zips each one, and removes the unzipped fasta.
    # NOTE(review): the 14nov2002 download directory name predates this
    # build's freeze -- confirm the target directory before running.
    cd /usr/local/apache/htdocs/goldenPath/14nov2002/bigZips
    featureBits hg14 -fa=upstream1000.fa refGene:upstream:1000
    zip upstream1000.zip upstream1000.fa
    featureBits hg14 -fa=upstream2000.fa refGene:upstream:2000
    zip upstream2000.zip upstream2000.fa
    featureBits hg14 -fa=upstream5000.fa refGene:upstream:5000
    zip upstream5000.zip upstream5000.fa
    # Only the zip archives are kept in the download directory.
    rm upstream*.fa
    
MAKING MOUSE AND RAT SYNTENY
# The same five-script pipeline is run twice: once against the mouse
# blastzBestMm3 table (loaded as syntenyMouse) and once against the rat
# blastzBestRn2 table (loaded as syntenyRat).
# NOTE(review): the scripts are called without a path or working directory,
# and the files handed between them are not named here -- presumably each
# reads the previous script's output from the current directory; confirm.
# Both runs load a file called ucsc100k.bed, so the rat run presumably
# overwrites the mouse file; load the mouse table before starting the rat run.
syntenicBest.pl -db=hg14 -table=blastzBestMm3
smooth.pl
joinsmallgaps.pl
fillgap.pl -db=hg14 -table=blastzBestMm3
synteny2bed.pl
hgLoadBed hg14 syntenyMouse ucsc100k.bed

syntenicBest.pl -db=hg14 -table=blastzBestRn2
smooth.pl
joinsmallgaps.pl
fillgap.pl -db=hg14 -table=blastzBestRn2
synteny2bed.pl
hgLoadBed hg14 syntenyRat ucsc100k.bed

