# for emacs: -*- mode: sh; -*-


This file describes how we made the browser database on 
NCBI build 31 (November, 2002 freeze)

[For importing GTF tracks, use /projects/compbio/bin/validate_gtf.pl]

HOW TO BUILD A ASSEMBLY FROM NCBI FILES
---------------------------------------

NOTE: It is best to run most of this stuff on eieio since it
is not adverse to handling files > 2Gb

0) Make gs.14 directory, gs.14/build31 directory, and gs.14/ffa directory.
   Make a symbolic link from /cluster/store1 to this location
	
	cd /cluster/store1
	ln -s (actual location)/gs.14 ./gs.14

   Make a symbolic link from your home directory to the build dir:

	ln -s /cluster/store4/gs.14/build31 ~/oo

1) Download seq_contig.md, ncbi_build31.agp, contig_overlaps.agp 
and contig fa file into gs.14/build31 directory. 

Download sequence.inf and ncbi_build31.fa files into gs.14/ffa, and unzip
ncbi_build31.fa.

*** For build31, files split into reference.agp/reference.fa (main O&O), DR51.agp/DR51.fa,
    and DR52.agp/DR52.fa. (alternate versions of MHC region).  These were concatenated 
    to get the ncbi_build31.agp and ncbi_build31.fa

2) Sanity check things with 
     /cluster/bin/i386/checkYbr build31/ncbi_build31.agp ffa/ncbi_build31.fa build31/seq_contig.md
     report any errors back to Richa and Greg at NCBI.

3) Convert fa files into UCSC style fa files and place in "contigs" directory
   inside the gs.14/build31 directory 

	cd build31
	mkdir contigs
	/cluster/bin/i386/faNcbiToUcsc -split -ntLast ../ffa/ncbi_build31.fa contigs

3.1) Make a fake chrM contig
cd ~/oo
mkdir M
copy in chrM.fa, chrM.agp and chrM.gl from previous version.
mkdir M/NT_999999
cp chrM.fa NT_999999/NT_999999.fa

4) Create lift files (this will create chromosome directory structure) and inserts file

	/cluster/bin/scripts/createNcbiLifts seq_contig.md .

5) Create contig agp files (will create contig directory structure)
	
	/cluster/bin/scripts/createNcbiCtgAgp seq_contig.md ncbi_build31.agp .

5.1) Create contig gl files

        ~kent/bin/i386/agpToGl contig_overlaps.agp . -md=seq_contig.md

6) Create chromsome agp files

	/cluster/bin/scripts/createNcbiChrAgp .

6.1) Copy over jkStuff
	mkdir jkStuff
        cp /cluster/store1/gs.13/build30/jkStuff/*.sh jkStuff
        cp /cluster/store1/gs.13/build30/jkStuff/*.csh jkStuff
        cp /cluster/store1/gs.13/build30/jkStuff/*.gsub jkStuff        

6.2) Patch in size of chromosome Y into Y/lift/ordered.lft 
     by grabbing it from the last line of Y/chrY.agp (not needed for build31)

6.2.5) Add last telomere gap of 100Kb in chr22.agp based on finished chr22.agp.
     Update size of chromosome 22 in 22/lift/ordered.lft with addition of this gap.

*** 6.3) Create chromosome gl files
  
        jkStuff/liftGl.sh contig.gl

7) Distribute contig .fa to appropriate directory (assumes all files
   are in "contigs" directory).

	/cluster/bin/scripts/distNcbiCtgFa contigs .
	rm -r contigs

8) Reverse complement NT contig fa files that are flipped in the assembly
   (uses faRc program) 

	/cluster/bin/scripts/revCompNcbiCtgFa seq_contig.md .

(NOTE: STS placements may be done at this point before repeat masking and 
using the .fas on NFS for QC analysis - all other placements should be 
done after repeat masking and distributing to cluster nodes)

9) Split contigs, run RepeatMasker, lift results
   Notes: 
   * If there is a new version of RepeatMasker, build it and ask the admins 
     to binrsync it (kkstore:/scratch/hg/RepeatMasker/*).
   * Contigs (*/NT_*/NT_*.fa) are split into 500kb chunks to make 
     RepeatMasker runs manageable on the cluster ==> results need lifting.
   * For the NCBI assembly we repeat mask on the sensitive mode setting
     (RepeatMasker -s)

        #- Split contigs into 500kb chunks:
        ssh eieio
        cd ~/oo
        foreach d ( ?{,?}/NT_* )
          cd $d
          set contig = $d:t
          faSplit size $contig.fa 500000 ${contig}_ -lift=$contig.lft \
            -maxN=500000
          cd ../..
        end

        #- Make the run directory and job list:
        cd ~/oo
        mkdir RMRun
        rm -f RMRun/RMJobs
        touch RMRun/RMJobs
        foreach d ( ?{,?}/NT_* )
          foreach f ( /cluster/store4/gs.14/build31/$d/NT_*_*.fa )
            set f = $f:t
            echo /cluster/bin/scripts/RMLocalSens \
                 /cluster/store4/gs.14/build31/$d $f \
                '{'check out line+ /cluster/store4/gs.14/build31/$d/$f.out'}' \
              >> RMRun/RMJobs
          end
        end

        #- Do the run
        ssh kk
        cd ~/oo/RMRun
        para create RMJobs
        para try, para check, para check, para push, para check,...

        #- Lift up the split-contig .out's to contig-level .out's
        cd ~/oo
        foreach d ( ?{,?}/NT_* )
          cd $d
          set contig = $d:t
          liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null
          cd ../..
        end


10) Lift up RepeatMask .out files to chromosome coordinates via
       cd ~/oo
       tcsh jkStuff/liftOut2.sh

10.1) Validate the RepeatMasking by randomly selecting a few NT_*.fa files, 
      manually repeat masking them and matching the .out files with the 
      related part in the chromosome-level .out files.  For example:

        ssh kk
        cd ~/oo
      Pick several values of $chr and $nt and run these commands: 
        set chr = ?
        set nt  = NT_??????
        mv $chr/$nt/$nt.fa.out $chr/$nt/$nt.fa.out.bak
        /scratch/hg/RepeatMasker/RepeatMasker -s $chr/$nt/$nt.fa
        rm $chr/$nt/$nt.fa.{masked,cat,cut,stderr,tbl}
      Compare each $chr/$nt/$nt.fa.out against the original and against 
      the appropriate part of $chr/chr$chr.fa.out (use the coords for 
      $nt given in seq_contig.md).  
        mv $chr/$nt/$nt.fa.out.bak $chr/$nt/$nt.fa.out

     For build 31, the following were checked:
     <1/NT_004321, Y/NT_025975, 11/NT_033237> (TODO)

11) Generate contig and chromosome level masked and unmasked files via:
       tcsh jkStuff/chrFa.sh
       tcsh jkStuff/makeFaMasked.sh

12) Copy all contig and chrom fa files to /scratch on kkstore to get ready for
    cluster jobs, and ask to propagate to nodes

        ssh kkstore
	cd ~/oo
        /cluster/bin/scripts/cpNcbiFaScratch . /scratch/hg/gs.14/build31

13) Create jkStuff/ncbi.lft for lifting stuff built w/NCBI assembly.
    Note: this ncbi.lift will not lift floating contigs to chr_random coords,
    but it will show the strand orientation of the floating contigs 
    (grep for '|').
        mdToNcbiLift seq_contig.md jkStuff/ncbi.lft 
    If a lift file has been edited (e.g. as in 6.2.5 above), edit ncbi.lft 
    to match.

CREATING DATABASE  (DONE 12/03/02 - MATT)

o - ln -s /cluster/store4/gs.14/build31 ~/oo
o - Make sure there is at least 5 gig free on hgwdev:/var/lib/mysql
o - Create the database.
     - ssh hgwdev
     - Enter mysql as the mysql root user.
     - At mysql prompt type:
        create database hg13;
        quit
     - make a semi-permanent read-only alias:
        alias hg13 mysql -u hguser -phguserstuff -A hg13
o - Tell the hgCentral database about it.  Log onto genome-centdb
    and enter mysql via
        mysql -u root -pbigSecret hgCentral
    At the mysql prompt type:
       insert into dbDb values("hg13", "Human Nov. 2002", "/cluster/store4/gs.14/build31/nib", "Human", "USP18", 1, 50, "Human");
o - Create the trackDb table as so
       cd ~/src/hg/makeDb/hgTrackDb
    Edit that makefile to add hg13 after hg12 and do
       make update
       cvs commit makefile


LOAD REPEAT MASKS (DONE 12/4/02)
    Load the RepeatMasker .out files into the database with:
       cd ~/oo
       hgLoadOut hg13 ?/*.fa.out ??/*.fa.out


MAKE LIFTALL.LFT (DONE 12/4/02)

    cd ~/oo
    cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft


STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE 12/18/02 - MATT)

Create packed chromosome sequence files 
     ssh eieio
     cd ~/oo
     tcsh jkStuff/makeNib.sh 

Load chromosome sequence info into database and save chrom sizes
     ssh hgwdev
     mkdir -p /gbdb/hg13/nib
     cd /gbdb/hg13/nib
     foreach f (/cluster/store4/gs.14/build31/nib/chr*.nib)
       ln -s $f
     end
     hgsql hg13 < ~/src/hg/lib/chromInfo.sql
     cd ~/oo
     hgNibSeq -preMadeNib hg13 /gbdb/hg13/nib ?{,?}/chr*.fa
     echo "select chrom,size from chromInfo" | hgsql -N hg13 > chrom.sizes

Store o+o info in database.
Note: for build31, Terry specially requested these files from NCBI:
  finished.finf
  draft.finf
  predraft.finf
  extras.finf

     cd /cluster/store4/gs.14/build31
     jkStuff/liftGl.sh contig.gl
     hgGoldGapGl hg13 /cluster/store4/gs.14 build31 
     cd /cluster/store4/gs.14
     hgClonePos hg13 build31 ffa/sequence.inf /cluster/store4/gs.14 -maxErr=3
#(Ignore warnings about missing clones - these are in chromosomes 21 and 22)
     hgCtgPos hg13 build31 

Make and load GC percent table
     ssh hgwdev
     cd ~/oo
     mkdir -p bed/gcPercent
     cd bed/gcPercent
     hgsql hg13  < ~/src/hg/lib/gcPercent.sql
     hgGcPercent hg13 ../../nib


GETTING FRESH mRNA, EST, REFSEQ SEQUENCE FROM GENBANK. (DONE 12/03/02 - MATT)

This will create a genbank.132 directory containing compressed
GenBank flat files and a mrna.132 containing unpacked sequence
info and auxiliary info in a relatively easy to parse (.ra) 
format.

  o - Point your browser to ftp://ftp.ncbi.nih.gov/genbank and
      look at the README.genbank.  Figure out the current release number
      (which is 132).
  o - Consider deleting one of the older genbank releases.  It''s
      good to at least keep one previous release though.
  o - Where there is space make a new genbank directory.  Create a
      symbolic link to it:
          mkdir /cluster/store1/genbank.132
          ln -s /cluster/store1/genbank.132 ~/genbank
      cd ~/genbank
  o - ftp ftp.ncbi.nih.gov  (do anonymous log-in).  Then do the
      following commands inside ftp:
           cd genbank
           prompt
           mget gbpri* gbrod* gbv* gbsts* gbest* gbmam* gbinv*
      This will take at least 2 hours.
  o - Make the refSeq subdir and download files:
       mkdir -p /cluster/store1/mrna.132/refSeq
       cd /cluster/store1/mrna.132/refSeq
  o - ftp ftp.ncbi.nih.gov  (do anonymous log-in).  Then do the
      following commands inside ftp:
           cd refseq/H_sapiens/mRNA_Prot
           prompt
           mget hs.*.gz
  o - Unpack this into fa files and get extra info with:
      cd /cluster/store1/mrna.132/refSeq
      gunzip -c hs.gbff.gz  | \
        gbToFaRa ~kent/hg/h/mrna.fil ../refSeq.fa ../refSeq.ra ../refSeq.ta \
        stdin
  o - cd /cluster/store2/mrna.132

# Make the RNAs
        gunzip -c /cluster/store2/genbank.132/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \
        | gbToFaRa -byOrganism=org anyRna.fil mrna.fa mrna.ra mrna.ta stdin

# Make the ESTs
        gunzip -c /cluster/store2/genbank.132/gbest*.gz | \
        gbToFaRa anyRna.fil est.fa est.ra est.ta stdin -byOrganism=org

# Make the nonhuman RNAs
        gunzip -c /cluster/store2/genbank.132/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \
        | gbToFaRa humanXenoRna.fil xenoRna.fa xenoRna.ra xenoRna.ta stdin

# Make the nonhuman ESTs
        gunzip -c /cluster/store2/genbank.132/gbest*.gz | \
        gbToFaRa humanXenoRna.fil xenoEst.fa xenoEst.ra xenoEst.ta stdin


STORING mRNA/EST SEQUENCE AND AUXILIARY INFO (DONE 12/04/02 - MATT)

o - Make /gbdb symlinks
     cd /gbdb
     mkdir hg13
     cd hg13
     mkdir mrna.132
     cd mrna.132
     ln -s /cluster/store2/mrna.132/org/Homo_sapiens/mrna.fa mrna.fa
     ln -s /cluster/store2/mrna.132/org/Homo_sapiens/mrna.ra mrna.ra
     ln -s /cluster/store2/mrna.132/org/Homo_sapiens/est.fa est.fa
     ln -s /cluster/store2/mrna.132/org/Homo_sapiens/est.ra est.ra
     ln -s /cluster/store1/refSeqCum/112602/org/Homo_sapiens/refSeq.fa refSeq.fa
     ln -s /cluster/store1/refSeqCum/112602/org/Homo_sapiens/refSeq.ra refSeq.ra
     
STORE MRNA (DONE 12/04/02 - MATT)

o - Store the mRNA (non-alignment) info in database.
     hgLoadRna new hg13
     hgLoadRna add -type=mRNA hg13 /gbdb/hg13/mrna.132/mrna.fa /gbdb/hg13/mrna.132/mrna.ra
     hgLoadRna add -type=EST hg13 /gbdb/hg13/mrna.132/est.fa /gbdb/hg13/mrna.132/est.ra

    The est line will take quite some time to complete.


MAKING AND STORING mRNA AND EST ALIGNMENTS (DONE 12/09/02 - MATT)

o - Load up the local disks of the cluster with refSeq.fa, mrna.fa and est.fa
    Copy the above 3 files from /cluster/store1/mrna.132 into 
    kkstore:/scratch/hg/mrna.132
    Request the admins to do a binrsync to the cluster.

o - Use BLAT to generate refSeq, mRNA and EST alignments as so:
      Make sure that /scratch/hg/gs.14/build31/contig/ is loaded
      with NT_*.fa and pushed to the cluster nodes.

          ssh kk

          mkdir -p /cluster/store4/gs.14/build31/bed
          cd /cluster/store4/gs.14/build31/bed

Using the bash shell do:
          for i in 'refSeq' 'mrna' 'est'
          do
              mkdir -p $i
              cd $i
              cp ~kent/lastOo/bed/$i/gsub .
              ls -1S /scratch/hg/gs.14/build31/trfFa.1204/*.fa.trf > genome.lst
              ls -1 /scratch/hg/mrna.132/Homo_sapiens/$i.fa > mrna.lst
              mkdir -p psl
              gensub2 genome.lst mrna.lst gsub spec
              para create spec
              cd ..
          done 

    Now, by hand cd to the mrna, refSeq, and est directories respectively
    and run a para push and para check in each one.

o - Process refSeq mRNA and EST alignments into near best in genome.
      cd ~/oo/bed
      cd refSeq
      pslSort dirs raw.psl /tmp psl
      pslReps -minCover=0.2 -sizeMatters -minAli=0.98 -nearTop=0.002 raw.psl contig.psl /dev/null
      liftUp -nohead all_refSeq.psl ../../jkStuff/liftAll.lft carry contig.psl
      pslSortAcc nohead chrom /tmp all_refSeq.psl
      cd .. 

      cd mrna
      pslSort dirs raw.psl /tmp psl
      #pslReps -minAli=0.96 -nearTop=0.01 raw.psl contig.psl /dev/null
      pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl /dev/null
      liftUp -nohead all_mrna.psl ../../jkStuff/liftAll.lft carry contig.psl
      pslSortAcc nohead chrom /tmp all_mrna.psl
      cd ..

      cd est
      pslSort dirs raw.psl /cluster/store3/tmp psl
      #pslReps -minAli=0.93 -nearTop=0.01 raw.psl contig.psl /dev/null
      pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl /dev/null
      liftUp -nohead all_est.psl ../../jkStuff/liftAll.lft carry contig.psl
      pslSortAcc nohead chrom /cluster/store3/tmp all_est.psl
      cd ..

o - Load refSeq alignments into database
      ssh hgwdev
      cd /cluster/store4/gs.14/build31/bed/refSeq
      pslCat -dir chrom > refSeqAli.psl
      hgLoadPsl hg13 -tNameIx refSeqAli.psl

o - Load mRNA alignments into database. 
      ssh hgwdev
      cd /cluster/store4/gs.14/build31/bed/mrna/chrom
In tcsh:
      rm *_mrna.psl
      foreach i (*.psl)
          mv $i $i:r_mrna.psl
      end
      hgLoadPsl hg13 *.psl
      cd ..
      hgLoadPsl hg13 all_mrna.psl -nobin

o - Load EST alignments into database.
      ssh hgwdev
      cd /cluster/store4/gs.14/build31/bed/est/chrom
in tcsh do:
      rm *_est.psl
      foreach i (*.psl)
            mv $i $i:r_est.psl
      end
      hgLoadPsl hg13 *.psl
      cd ..
      hgLoadPsl hg13 all_est.psl -nobin

o - Create subset of ESTs with introns and load into database.
      - ssh eieio
      cd ~/oo
      tcsh jkStuff/makeIntronEst.sh
      - ssh hgwdev
      cd ~/oo/bed/est/intronEst
      hgLoadPsl hg13 *.psl

o - Put orientation info on ESTs and mRNAs into database:
    Note: the cluster run requires /scratch/.../trfFa.1204/ to be in place,
    so this step should be run after "PREPARING SEQUENCE FOR CROSS SPECIES 
    ALIGNMENTS" below.
     ssh kk
     cd ~/oo/bed/est
     pslSortAcc nohead contig /cluster/store3/tmp contig.psl
     cd ~/oo/bed/mrna
     pslSortAcc nohead contig /cluster/store3/tmp contig.psl
     ssh kkstore
     mkdir -p /scratch/hg/gs.14/build31/bed
     cp -r ~/oo/bed/est/contig /scratch/hg/gs.14/build31/bed/est
     cp -r ~/oo/bed/mrna/contig /scratch/hg/gs.14/build31/bed/mrna

    Ask admins to binrsync /scratch/hg/gs.14/build31/bed/* to the cluster.
     ssh kk
     foreach d (est mrna)
       mkdir -p ~/oo/bed/${d}OrientInfo/oi
       cd ~/oo/bed/${d}OrientInfo
       ls -1 /scratch/hg/gs.14/build31/bed/${d}/*.psl > psl.lst
       cp ~/hg12/bed/${d}OrientInfo/gsub .
     end
    Edit ~/oo/bed/${d}OrientInfo/gsub to point to the correct paths.
    For each of ~/oo/bed/{est,mrna}OrientInfo, cd there and do this:
     gensub2 psl.lst single gsub spec
     para create spec
     para try 
     para check      
     para push
   check until done, or use 'para shove'

ssh hgwdev
   When the cluster run is done do:
     foreach d (est mrna)
       cd ~/oo/bed/${d}OrientInfo
       liftUp ${d}OrientInfo.bed ~/oo/jkStuff/liftAll.lft warn oi/*.tab
       hgLoadBed hg13 ${d}OrientInfo ${d}OrientInfo.bed \
         -sqlTable=$HOME/src/hg/lib/${d}OrientInfo.sql > /dev/null
     end

o - Create rnaCluster table (depends on {est,mrna}OrientInfo above)
   ssh hgwdev
   cd ~/oo
   # Create a list of accessions that come from RAGE libraries and need to
   # be excluded. (added by Chuck Wed Nov 27 13:09:07 PST 2002)
   kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg13 hg13.rage.libs	
   mkdir -p ~/oo/bed/rnaCluster/chrom
   foreach i (? ??)
       cd $i
       foreach j (chr*.fa)
           set c = $j:r
           set f = ../bed/rnaCluster/chrom/$c.bed
           # Exclude accesions in the RAGE file up one directory
           echo clusterRna -mrnaExclude=../hg13.rage.libs hg13 /dev/null $f -chrom=$c
           clusterRna -mrnaExclude=../hg13.rage.libs  hg13 /dev/null $f -chrom=$c
       end
       cd ..
   end

   cd bed/rnaCluster
   hgLoadBed hg13 rnaCluster chrom/*.bed > /dev/null

PRODUCING KNOWN GENES (DONE 11/26/02 - MATT)

o - Download everything from ftp://ftp.ncbi.nih.gov/refseq/cumulative
    into /cluster/store1/refSeqCum/112602
        cd /cluster/store1/refSeqCum/112602
        uncompress rscu.gbff.Z
        Put the following lines into anyRna.fil
        restrict mol=mRNA
        hide mol

        gbToFaRa anyRna.fil refSeq.fa refSeq.ra refSeq.ta -byOrganism=org rscu.gbff
        This will create fasta and annotations for the human in refSeq/org/Homo_sapiens
        in refSeq.fa and refSeq.ra under the org/Homo_sapiens directory
o - Get extra info from NCBI and produce refGene table as so:
       wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/loc2ref
       wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/mim2loc

o -   Load the refSeq mRNA
      hgLoadRna add -type=refSeq hg13 /gbdb/hg13/mrna.132/refSeq.fa /gbdb/hg13/mrna.132/refSeq.ra

o - Produce refGene, refPep, refMrna, and refLink tables as so:
Get the human proteins:
        wget ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/hs.faa.gz
        gunzip hs.faa.gz
        cd ~/oo/bed/refSeq
       hgRefSeqMrna hg13 /cluster/store1/refSeqCum/112602/org/Homo_sapiens/refSeq.fa \
        /cluster/store1/refSeqCum/112602/org/Homo_sapiens/refSeq.ra \
        all_refSeq.psl /cluster/store1/refSeqCum/112602/loc2ref \
        /cluster/store1/refSeqCum/112602/hs.faa \
        /cluster/store1/refSeqCum/112602/mim2loc
# Don't worry about the "No gene name" errors
o - Add RefSeq status info
    hgRefSeqStatus -human hg13 /cluster/store1/refSeqCum/112602/loc2ref

o - create precomputed join of refFlat and refGene:
      echo 'CREATE TABLE refFlat (KEY geneName (geneName), KEY name (name), KEY chrom (chrom)) SELECT refLink.name as geneNam\
e, refGene.* FROM refLink,refGene WHERE refLink.mrnaAcc = refGene.name' | hgsql hg13

o - Add Jackson labs info
     cd ~/oo/bed
     mkdir jaxOrtholog
     cd jaxOrtholog
     wget ftp://ftp.informatics.jax.org/pub/informatics/reports/HMD_Human3.rpt
     cp /cluster/store1/gs.12/build29/bed/jaxOrtholog/filter.awk .
     awk -f filter.awk *.rpt > jaxOrtholog.tab
    Drop (just in case), create and load the table like this:
     echo 'drop table jaxOrtholog;' | hgsql hg13
     hgsql hg13 < ~/src/hg/lib/jaxOrtholog.sql
     echo "load data local infile '"`pwd`"/jaxOrtholog.tab' into table jaxOrtholog;" \
       | hgsql hg13

REFFLAT and GENEBANDS

o - create precomputed join of refFlat and refGene:
      echo 'CREATE TABLE refFlat (KEY geneName (geneName), KEY name (name), KEY chrom (chrom)) SELECT refLink.name as geneName, refGene.* FROM refLink,refGene WHERE refLink.mrnaAcc = refGene.name' | hgsql hg13

DONE (MATT 2/03/03)
o - Create precomputed geneBands table:
      ssh hgwdev
      hgGeneBands hg13 geneBands.txt
      create the geneBands table from kent/src/hg/lib/geneBands.sql
      hgsql hg13
      >mysql load data local infile 'geneBands.txt' into table geneBands;
      >mysql quit
      rm geneBands.txt

SIMPLE REPEAT TRACK (DONE 12/4/02)

    # Create cluster parasol job like so:
    ssh kk
    mkdir -p ~/oo/bed/simpleRepeat
    cd ~/oo/bed/simpleRepeat
    cp ~/lastOo/bed/simpleRepeat/gsub .
    mkdir trf

    # Ask the admins to push /scratch/hg/gs.14/build31/ to the cluster
    ls -1S /scratch/hg/gs.14/build31/contig/*.fa > genome.lst
    echo "" > dummy.list
    gensub2 genome.lst dummy.list gsub spec
    para create spec
    para try
    para check
    para push
    liftUp simpleRepeat.bed ~/oo/jkStuff/liftAll.lft warn trf/*.bed

    # Load into the database:
    ssh hgwdev
    cd ~/oo/bed/simpleRepeat
    hgLoadBed hg13 simpleRepeat simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql


PREPARING SEQUENCE FOR CROSS SPECIES ALIGNMENTS (DONE 12/4/02)

    # After the simpleRepeats track has been built, make a filtered version 
    # of the trf output: keep trf's with period <= 12:
    ssh eieio
    cd ~/hg13/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/NT_*.bed)
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    # Lift up filtered trf output to chrom coords as well:
    cd ~/hg13
    mkdir -p bed/simpleRepeat/trfMaskChrom
    foreach c (?{,?})
      if (-e $c/lift/ordered.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
          $c/lift/ordered.lst > $c/lift/oTrf.lst
      endif
      if (-e $c/lift/random.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
           $c/lift/random.lst > $c/lift/rTrf.lst
      endif
      if (-e $c/lift/oTrf.lst) then
        liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
      endif
      if (-e $c/lift/rTrf.lst) then
        liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
      endif
    end
    # Make a trfMasked version of the contigs:
    cd ~/hg13
    mkdir trfFa
    foreach f (?{,?}/NT_*/NT_??????.fa)
      set c = $f:t:r
      maskOutFa -softAdd $f bed/simpleRepeat/trfMask/$c.bed trfFa/$c.fa.trf
    end
    # Then make sure there is enough space available on /scratch and do
    ssh kkstore
    cp -Rp ~/hg13/trfFa /scratch/hg/gs.14/build31/trfFa.1204
    # ask for binrsync, or sudo /cluster/install/utilities/updateLocal

MAKE DOWNLOADABLE SEQUENCE FILES (DONE 12/4/02)

    # This used to be done right after RepeatMasking.  Now, we mask with 
    # TRF as well, so do this after the "PREPARING SEQUENCE" step above.
    ssh eieio
    cd ~/hg13
    #- Soft-mask (lower-case) the contig and chr .fa's
    ./jkStuff/makeFaTrfMasked.sh
    #- Make hard-masked .fa.masked files as well:
    ./jkStuff/makeHardMasked.sh
    #- Rebuild the nib, mixedNib, maskedNib files:
    ./jkStuff/makeNib.sh
    #- Rebuild the .zip files
    ./jkStuff/zipAll.sh |& tee zipAll.log
    #- Look at zipAll.log to make sure all file lists look reasonable.  
    #- Check zip file integrity:
    foreach f (*.zip)
      unzip -t $f > $f.test
      tail -1 $f.test
    end
    wc -l *.zip.test

    #- Copy the .zip files to hgwdev:/usr/local/apache/...
    ssh hgwdev
    cd ~/hg13
    ./jkStuff/cpToWeb.sh
    cd /usr/local/apache/htdocs/goldenPath/14nov2002
    #- Take a look at bigZips/* and chromosomes/*, update their README.txt's


PREPARE CLUSTER FOR BLASTZ RUN (DONE 12/4/02)

    # This needs to be done after trf-masking and nib generation.
    ssh eieio
    # Extract lineage-specific repeats using Arian Smit's script:
    mkdir -p ~/hg13/bed/linSpecRep
    cd ~/hg13/bed/linSpecRep
    foreach f (~/hg13/*/*.out)
        ln -sf $f .
    end
    /cluster/bin/scripts/rodentSpecificRepeats.pl *.out
    /cluster/bin/scripts/perl-rename 's/(\.fa|\.nib)//' *.out.*spec
    /cluster/bin/scripts/perl-rename 's/\.(rod|prim)spec/.spec/' *.out.*spec
    rm *.out
    # Copy files to the kkstore:/scratch
    ssh kkstore
    # lineage-specific repeats:
    cd ~/hg13/bed
    mkdir -p /scratch/hg/gs.14/build31
    rm -rf /scratch/hg/gs.14/build31/linSpecRep
    cp -Rp linSpecRep /scratch/hg/gs.14/build31
    # RepeatMasker .out:
    cd ~/hg13
    rm -rf /scratch/hg/gs.14/build31/rmsk
    mkdir -p /scratch/hg/gs.14/build31/rmsk
    cp -p ?{,?}/chr?{,?}{,_random}.fa.out /scratch/hg/gs.14/build31/rmsk
    # Chrom-level mixed nibs that have been repeat- and trf-masked:
    rm -rf /scratch/hg/gs.14/build31/chromTrfMixedNib
    mkdir -p /scratch/hg/gs.14/build31/chromTrfMixedNib
    cp -p mixedNib/chr*.nib /scratch/hg/gs.14/build31/chromTrfMixedNib
    # Ask cluster-admin@cse.ucsc.edu to binrsync /scratch/hg to clusters

    # Jim's comments Feb 12 '03 about the order in which to run blastz:
    # In general we should do
    # 1) hg/mm
    # 2) mm/rn
    # 3) rn/hg
    # 4) hg/hg
    # 5) mm/mm
    # 6) rn/rn
    # There is now an 'axtSwap' program that might let us
    # get out of having to run the inverse of 1,2 & 3,  though
    # 2 in particular is so fast perhaps it's just as well to
    # do the inverse explicitly.


PRODUCING GENSCAN PREDICTIONS (DONE 12/5/02)

    ssh eieio
    mkdir -p ~/oo/bed/genscan
    cd ~/oo/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir -p gtf pep subopt
    # Generate a list file, genome.list, of all the contigs
    # *that do not have pure Ns* (due to heterochromatin, unsequencable 
    # stuff) which would cause genscan to run forever.
    rm -f genome.list
    touch genome.list
    foreach f ( `ls -1S /cluster/store4/gs.14/build31/?{,?}/NT_*/NT_??????.fa.masked` )
      egrep '[ACGT]' $f > /dev/null
      if ($status == 0) echo $f >> genome.list
    end
        
    # Log into kkr1u00 (not kk!).  kkr1u00 is the driver node for the small
    # cluster (kkr2u00 -kkr8u00. Genscan has problem running on the
    # big cluster, due to limitation of memory and swap space on each
    # processing node).
    ssh kkr1u00
    # Create template file, gsub, for gensub2.  For example (3-line file):
#LOOP    rm -f genome.list
/cluster/home/kent/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
    echo "" > dummy.list
    gensub2 genome.list dummy.list gsub jobList
    para create jobList
    para try
    para check
    para push
    # Issue either one of the following two commands to check the
    # status of the cluster and your jobs, until they are done.
    parasol status
    para check
    # If there were out-of-memory problems (run "para problems"), then 
    # re-run those jobs by hand but change the -window arg from 2400000
    # to 1200000.  In build31, these were 22/NT_011519 and 19/NT_011109.

    # Convert these to chromosome level files as so:     
    ssh eieio
    cd ~/oo/bed/genscan
    liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/NT*.gtf
    liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/NT*.bed > \
      /dev/null
    cat pep/*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd ~/oo/bed/genscan
    ldHgGene hg13 genscan genscan.gtf
    hgPepPred hg13 generic genscanPep genscan.pep
    hgLoadBed hg13 genscanSubopt genscanSubopt.bed > /dev/null


LOAD CPGISLANDS (DONE 12/4/02)
     ssh eieio
     mkdir -p ~/oo/bed/cpgIsland
     cd ~/oo/bed/cpgIsland
     # Build software emailed from Asif Chinwalla (achinwal@watson.wustl.edu)
     # copy the tar file to the current directory
     tar xvf cpg_dist.tar 
     cd cpg_dist
     gcc readseq.c cpg_lh.c -o cpglh.exe
     cd ..
     # cpglh.exe requires hard-masked (N) .fa's.  
     # There may be warnings about "bad character" for IUPAC ambiguous 
     # characters like R, S, etc.  Ignore the warnings.  
     foreach f (../../?{,?}/chr?{,?}{,_random}.fa.masked)
       set fout=$f:t:r:r.cpg
       ./cpg_dist/cpglh.exe $f > $fout
       echo Done with $fout
     end
     cp ~/lastOo/bed/cpgIsland/filter.awk .
     awk -f filter.awk chr*.cpg > cpgIsland.bed
     ssh hgwdev
     cd ~/oo/bed/cpgIsland
     hgLoadBed hg13 cpgIsland -tab -noBin \
       -sqlTable=$HOME/kent/src/hg/lib/cpgIsland.sql cpgIsland.bed


CREATE GOLDEN TRIANGLE (todo)

# Make sure that rnaCluster table is in place.   Then extract Affy 
# expression info into a form suitable for Eisen's clustering program with:
      cd ~/oo/bed
      mkdir triangle
      cd triangle
      eisenInput hg13 affyHg10.txt
Transfer this to Windows and do k-means clustering
with k=200 with cluster.  Transfer results file back
to ~/oo/bed/triangle/affyCluster_K_G200.kgg.  Then
do
      promoSeqFromCluster hg13 1000 affyCluster_K_G200.kgg kg200.unmasked
Then RepeatMask the .fa file inkg200.unmasked, and copy masked versions
to kg200.   Then
      cat kg200/*.fa > all1000.fa
and set up cluster Improbizer run to do 100 controls for every real
run on each - putting the output in imp.200.1000.e.  When improbizer
run is done make a file summarizing the runs as so:
      cd imp.200.1000.e
      motifSig ../imp.200.1000.e.iri ../kg200 motif control*
get rid of insignificant motifs with:
      cd ..
      awk '{if ($2 > $3) print; }' imp.200.1000.e.iri > sig.200.1000.e.iri
turn rest into just dnaMotifs with
      iriToDnaMotif sig.200.1000.e.iri motif.200.1000.e.txt
Extract all promoters with
      featureBits hg13 rnaCluster:upstream:1000 -bed=upstream1000.bed -fa=upstream1000.fa
Locate motifs on all promoters with
      dnaMotifFind motif.200.1000.e.txt upstream1000.fa hits.200.1000.e.txt -rc -markov=2
      liftPromoHits upstream1000.bed hits.200.1000.e.txt triangle.bed

CREATE STS/FISH/BACENDS/CYTOBANDS DIRECTORY STRUCTURE AND SETUP (DONE - TSF)

o - Create directory structure to hold information for these tracks
	cd /projects/hg2/booch/psl/
	change Makefile parameters for OOVERS, GSVERS, PREVGS, PREVOO
	make new

o - Update all Makefiles with latest OOVERS and GSVERS, DATABASE, and locations of .fa files

o - Create accession_info file
	make accession_info.rdb

UPDATE STS INFORMATION (DONE - TSF 12/2002)

o - Download and unpack updated information from dbSTS:

	In a web browser, go to ftp://ftp.ncbi.nih.gov/repository/dbSTS/.  Download 
    	dbSTS.sts, dbSTS.aliases, and dbSTS.FASTA.dailydump.Z to 
    	/projects/hg2/booch/psl/update

	-Unpack dbSTS.FASTA.dailydump.Z
	gunzip dbSTS.FASTA.dailydump.Z

o - Create updated files
	cd /projects/hg2/booch/psl/update
	edit Makefile to latest sts.X version from PREV (currently sts.4)
	make update

o - Make new directory for this info and move files there
	ssh kks00
	mkdir /cluster/store1/sts.5
	cp all.STS.fa /cluster/store1/sts.5
	cp all.primers /cluster/store1/sts.5
	cp all.primers.fa /cluster/store1/sts.5

o - Copy new files to cluster
	ssh kkstore
	cd /cluster/store1/sts.5
	cp /cluster/store1/sts.5/*.* /scratch/hg/STS
	ask for propagation from sysadmin

STS ALIGNMENTS (DONE - TSF 12/2002)
(alignments done without RepeatMasking, so start ASAP!)

o - Create full sequence alignments
	ssh kk

	cd /cluster/home/booch/sts
	- update Makefile with latest OOVERS and GSVERS
	make new
	make jobList.scratch (if contig files propagated to nodes)
		- or _
	make jobList.disk (if contig files not propagated)
	para create jobList
	para push (or para try/para check if want to make sure it runs)
	make stsMarkers.psl

o - Copy files to final destination and remove originals
	ssh kks00
	make copy.assembly
	make clean.assembly

o - Create primer alignments
	ssh kk
	cd /cluster/home/booch/primers
	- update Makefile with latest OOVERS and GSVERS
	make new
	make jobList.scratch (if contig files propagated to nodes)
		- or _
	make jobList.disk (if contig files not propagated)
	para create jobList
	para push (or para try/para check if want to make sure it runs)
	make primers.psl

o - Copy files to final destination and remove
	ssh kks00
	make copy.assembly
	make clean.assembly
	
o - Create ePCR alignments
	ssh kk
	cd /cluster/home/booch/epcr
	- update Makefile with latest OOVERS and GSVERS
	make new
	make jobList.scratch (if contig files propagated to nodes)
		- or _
	make jobList.disk (if contig files not propagated)
	para create jobList
	para push (or para try/para check if want to make sure it runs)
	make primers.psl

o - Copy files to final destination and remove
	ssh kks00
	make copy.assembly
	make clean.assembly
	
CREATE AND LOAD STS MARKERS TRACK (DONE - TSF 12/2002)

o - Copy in current stsInfo2.bed and stsAlias.bed files
	cd /projects/hg2/booch/psl/gs.14/build31
	cp ../update/stsInfo2.bed .
	cp ../update/stsAlias.bed .

o - Create final version of sts sequence placements
	ssh kks00
	cd /projects/hg2/booch/psl/gs.14/build31/sts
	make stsMarkers.final

o - Create final version of primers placements
	cd /projects/hg2/booch/psl/gs.14/build31/primers
	cp /cluster/store1/sts.5/all.primers .
	make primers.final

o - Create bed file
	cd /projects/hg2/booch/psl/gs.14/build31
	make stsMap.bed

o - Create database tables
	ssh hgwdev
	cd /projects/hg2/booch/psl/tables
	mysql -uhgcat -pXXXXXXX hg13 < all_sts_primer.sql
	mysql -uhgcat -pXXXXXXX hg13 < all_sts_seq.sql
	mysql -uhgcat -pXXXXXXX hg13 < stsAlias.sql
	mysql -uhgcat -pXXXXXXX hg13 < stsInfo2.sql
	mysql -uhgcat -pXXXXXXX hg13 < stsMap.sql

o - Load the tables
	load /projects/hg2/booch/psl/gs.14/build31/sts/stsMarkers.psl.filter.lifted into all_sts_seq	
	load /projects/hg2/booch/psl/gs.14/build31/primers/primers.psl.filter.lifted into all_sts_primer	
	load /projects/hg2/booch/psl/gs.14/build31/stsAlias.bed into stsAlias
	load /projects/hg2/booch/psl/gs.14/build31/stsInfo2.bed into stsInfo2
	load /projects/hg2/booch/psl/gs.14/build31/stsMap.bed into stsMap

o - Load the sequences (change sts.# to match correct location)
	hgLoadRna addSeq hg13 /cluster/store1/sts.6/all.STS.fa
	hgLoadRna addSeq hg13 /cluster/store1/sts.6/all.primers.fa


UPDATE BACEND SEQUENCES (DONE TSF 12/2002)

o - Download new files:

	In a web browser, go to ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/BACENDS/.  Download 
    	AllBACends.fa.gz and bacends.cl_acc_gi_len_primer to 
    	/cluster/store1/bacends.3

	-Unpack AllBACends.fa.gz
	gunzip AllBACends.fa.gz

o - Create new pairs file

o - Copy file to cluster
	ssh kkstore
	cd /cluster/store1/bacends.3
	cp /cluster/store1/bacends3/AllBACends.fa /scratch/hg/h/bacEnds/BACends.fa
	ask for propagation from sysadmin


BACEND SEQUENCE ALIGNMENTS (DONE - TSF 12/2002)
(alignments done without RepeatMasking, so start ASAP!)

o - Create full sequence alignments
	ssh kk
	cd /cluster/home/booch/bacends
	- update Makefile with latest OOVERS and GSVERS
	make new
	make jobList
	para create jobList
	para push (or para try/para check if want to make sure it runs)
	make bacEnds.psl

o - Copy files to final destination and remove
	ssh kks00
	make copy.assembly
	make clean.assembly

BACEND PAIRS TRACK (DONE - TSF 12/2002)

o - Update Makefile with location of pairs files, if necessary
	cd /projects/hg2/booch/psl/gs.14/build31/bacends
	edit Makefile (PAIRS=....)

o - Create bed file
	ssh kks00
	cd /projects/hg2/booch/psl/gs.14/build31/bacends
	make bacEndPairs.bed

o - Create database tables
	ssh hgwdev
	cd /projects/hg2/booch/psl/tables
	mysql -uhgcat -pXXXXXXX < all_bacends.sql
	mysql -uhgcat -pXXXXXXX < bacEndPairs.sql

o - Load the tables
	load /projects/hg2/booch/psl/gs.14/build31/bacends/bacEnds.psl.filter.lifted into all_bacends	
	load /projects/hg2/booch/psl/gs.14/build31/bacends/bacEndPairs.bed into bacEndPairs

o - Load the sequences (change bacends.# to match correct location)
	hgLoadRna addSeq hg13 /cluster/store1/bacends.2/BACends.fa
		
FOSEND SEQUENCE ALIGNMENTS (DONE - TSF 12/2002)

o - Create full sequence alignments
	ssh kk
	cd /cluster/home/booch/fosends
	- update Makefile with latest OOVERS and GSVERS
	make new
	make jobList
	para create jobList
	para push (or para try/para check if want to make sure it runs)
	make fosEnds.psl

o - Copy files to final destination and remove
	ssh kks00
	make copy.assembly
	make clean.assembly

FOSEND PAIRS TRACK (DONE - TSF 12/2002)

o - Update Makefile with location of pairs files, if necessary
	cd /projects/hg2/booch/psl/gs.14/build31/fosends

o - Create bed file
	ssh kks00
	cd /projects/hg2/booch/psl/gs.14/build31/fosends
	make fosEndPairs.bed

o - Create database tables
	ssh hgwdev
	cd /projects/hg2/booch/psl/tables
	mysql -uhgcat -pXXXXXXX < all_fosends.sql
	mysql -uhgcat -pXXXXXXX < fosEndPairs.sql

o - Load the tables
	load /projects/hg2/booch/psl/gs.14/build31/fosends/fosEnds.psl.filter.lifted into all_fosends	
	load /projects/hg2/booch/psl/gs.14/build31/fosends/fosEndPairs.bed into fosEndPairs

o - Load the sequences (change bacends.# to match correct location)
	hgLoadRna addSeq hg13 /cluster/store1/fosends.1/fosEnds.fa
		
UPDATE FISH CLONES INFORMATION (DONE - TSF 12/2002)

o - Download the latest info from NCBI
	point browser at http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg
	change "Show details on sequence-tag" to "yes"
	change "Download or Display" to "Download table for UNIX"
	press Submit - save as /projects/hg2/booch/psl/fish/hbrc/hbrc.YYYYMMDD.table

o - Format file just downloaded
	cd /projects/hg2/booch/psl/fish/
	make HBRC

o - Copy it to the new freeze location
	cp /projects/hg2/booch/psl/fish/all.fish.format /projects/hg2/booch/psl/gs.14/build31/fish/


CREATE AND LOAD FISH CLONES TRACK (TO DO)
(must be done after STS markers track and BAC end pairs track)

o - Extract the file with clone positions from database
	ssh hgwdev
	mysql -uhgcat -pXXXXXXXX hg13
	mysql>  select * into outfile "/tmp/booch/clonePos.txt" from clonePos;
	mysql> quit
	mv /tmp/booch/clonePos.txt /projects/hg2/booch/psl/gs.14/build31/fish

o - Create bed file
	cd /projects/hg2/booch/psl/gs.14/build31/fish
	make bed

o - Create database table
	ssh hgwdev
	cd /projects/hg2/booch/psl/tables
	mysql -uhgcat -pXXXXXXX < fishClones.sql

o - Load the table
	load /projects/hg2/booch/psl/gs.14/build31/fish/fishClones.bed into fishClones
	

CREATE AND LOAD CHROMOSOME BANDS TRACK (TO DO)
(must be done after FISH Clones track) 

o - Create bed file
	ssh hgwdev
	make setBands.txt
	make cytobands.pct.ranges
	make predict

o - Create database table
	ssh hgwdev
	cd /projects/hg2/booch/psl/tables
	mysql -uhgcat -pXXXXXXX < cytoBand.sql
	

o - Load the table
	load /projects/hg2/booch/psl/gs.14/build31/cytobands/cytobands.bed into cytoBand


CREATE CHROMOSOME REPORTS (TO DO)


CREATE STS MAP COMPARISON PLOTS AND GENETIC PLOTS (DONE - TSF 12/2002)

o - Must wait until after the STS Map track has been finished

o - Create sts plots
	cd /projects/hg2/booch/psl/gs.14/build31/stsPlots
	make stsplots

o - Create genetic plots
	cd /projects/hg2/booch/psl/gs.14/build31/geneticPlots
	make all
	matlab -nodesktop
        >> allplot_ncbi('/cse/grads/booch/tracks/gs.14/build31/geneticPlots/','build31', 'jpg');
        >> quit

o - Set up directories where this will end up
	ssh hgwdev
	cd /usr/local/apache/htdocs/goldenPath/mapPlots
	update Makefile with OOVERS, GSVERS, and FREEZE date
	make new

o - Copy over files
	make sts
	make genetic

o - Update the index.html to include links to these new plots, and delete oldest set
    Update the arch.html with the oldest set just removed from index.html
	*** Make sure to check into CVS ***


HUMAN/MOUSE BLAT ALIGNMENTS (TO DO)

    # Process the trfFa files (contigs, lower-case repeat and tandem-repeat 
    # masked) into about 500 files containing records of <= 20kb each:
    # 
    # First, make .unlft (unlifted) versions of all mouse contig .agp's:
    ssh eieio
    cd ~/mm2
    foreach ctgAgp (?{,?}/chr*/chr?{,?}_?{,?}.agp)
      ~/kent/src/hg/splitFaIntoContigs/deLiftAgp.pl jkStuff/liftAll.lft \
        $ctgAgp > $ctgAgp.unlft
    end

    # Now use the unlifted contig .agp's to further split the (super-)contigs 
    # into smaller "sub"-contigs (still at contig boundaries):
    foreach ctgAgp (?{,?}/chr*/chr?{,?}_?{,?}.agp.unlft)
      set ctg=$ctgAgp:t:r:r
      splitFaIntoContigs $ctgAgp trfFa/$ctg.fa.trf trfFaSplit -nSize=15000
    end
    # Create a lift file for all sub-contigs.
    cat trfFaSplit/*/lift/ordered.lft > trfFaSplit/allSubContigs.lft

    # Since splitFaIntoContigs enforces a min/approximate size and we need 
    # to enforce a max size, use faSplit on sub-contigs.  Build up a list 
    # file naming all the split sub-contigs.  
    set splitSubDir=trfFaSplit/splitSubs
    mkdir -p $splitSubDir
    rm -f $splitSubDir/splitSubs.lst
    touch $splitSubDir/splitSubs.lst
    foreach ctgDir (trfFaSplit/?{,?}_?{,?})
      foreach subCtgFa ($ctgDir/chr*/chr*.fa)
        set subCtg=$subCtgFa:t:r
        faSplit size $subCtgFa 20000 $splitSubDir/${subCtg}_ \
          -lift=$splitSubDir/$subCtg.lft -maxN=20000
        foreach ss ($splitSubDir/${subCtg}_*)
          echo $ss >> $splitSubDir/splitSubs.lst
        end
      end
    end
    # Create a lift file for all split sub-contigs.
    # or not -- too many files for cat.  Create per-chunk lift files below.

    # Divide the list of split sub-contigs into ~500 chunks
    splitFile  $splitSubDir/splitSubs.lst 350 $splitSubDir/splitSubs_

    # cat the split-sub-contig .fa's into multi-record chunk .fa's 
    # for para job generation.  Make a lift file for each chunk.  
    set chunkDir = trfFaSplit/chunks
    mkdir -p $chunkDir
    rm -f /tmp/makeLft.log
    touch /tmp/makeLft.log
    foreach chunkLst ($splitSubDir/splitSubs_*)
      set chunkNum=`echo $chunkLst | sed -e 's/.*_//g'`
      set chunkLft = $chunkDir/chunk_$chunkNum.lft
      rm -f $chunkLft
      touch $chunkLft
      set lastSubCtg = ""
      foreach splitSubFa (`cat $chunkLst`)
        set splitSub = $splitSubFa:r
        set subCtg = `echo $splitSub | perl -wpe 's/(chr\w+_\d+_\d+)_\d+/$1/'`
        if ("$subCtg" != "$lastSubCtg") then
          echo "subCtg changed from $lastSubCtg to $subCtg; catting $subCtg.lft onto $chunkLft" >> /tmp/makeLft.log
          cat $subCtg.lft >> $chunkLft
          set lastSubCtg = $subCtg
        endif
      end
      cat `cat $chunkLst` > $chunkDir/chunk_$chunkNum.fa
    end
    # Put those files on cluster nodes' /scratch:
    mkdir /scratch/hg/mm2/splitContigChunks
    cp -p $chunkDir/chunk_* /scratch/hg/mm2/splitContigChunks
    # Ask sysadmins for an updateLocal/binrsync

    # Now we're ready to set up the cluster run!
    mkdir -p ~/oo/bed/blatMus
    cd ~/oo/bed/blatMus
    # Use the mouse multi-record chunks created above:
    ls -1 /scratch/hg/mm2/splitContigChunks/chunk_*.fa > mouse.lst
    # Then bundle up the human into pieces of less than 12 meg mostly:
    rm -f bigH smallH
    foreach f (/scratch/hg/gs.14/build31/trfFa.1204/*.fa.trf)
      set size = `ls -l $f | awk '{print $5;}'`
      if ($size < 13000000) then
        echo $f >> smallH
      else
        echo $f >> bigH
      endif
    end
    mkdir hs
    cd hs
    splitFile ../bigH 1 big
    rm bigXX # Note, this is just an empty file that the splitFile program erroneously created. Remove the last one.
    splitFile ../smallH 4 small
    rm smallXXX
    cd ..
    ls -1 hs/* > human.lst
    cat > gsub <<EOF
#LOOP
/cluster/home/kent/bin/i386/blat -qMask=lower -mask=lower -q=dnax -t=dnax {check in line+ \$(path1)} {check in line+ \$(path2)} {check out line+ psl/\$(root1)_\$(root2).psl}
#ENDLOOP
EOF
    gensub2 human.lst mouse.lst gsub spec

    # Do the cluster run
    ssh kk
    cd ~/oo/bed/blatMus
    mkdir psl
    para create spec
    para try, para check
    # Note: multiple "para push"es may be required, or 
    # "para push -maxQueue 200000 -maxPush 200000" or so.
    para push, para check...

    # Aggregate results and perform several kinds of coord-lifting:
    ssh eieio
    cd ~/oo/bed/blatMus
    # First from mouse split-sub-contigs to sub-contigs (individual .lft 
    # file for each chunk of split-sub-contigs):
    set chunkDir = ~/mm2/trfFaSplit/chunks
    mkdir pslLft
    rm -f /tmp/lft.log
    touch /tmp/lft.log
    foreach f (psl/*.psl)
      set chunk = `echo $f:t:r | perl -wpe 's/.*(chunk_\d+)/$1/'`
      liftUp -type=.psl -pslQ pslLft/$f:t $chunkDir/$chunk.lft warn $f \
        >>& /tmp/lft.log
    end
    # Then the kinds of lifting we can do all at once: 
    # mouse sub-contigs to mouse contigs, 
    # mouse contigs to mouse chrom, 
    # human contigs to human chrom:
    pslCat -dir -check pslLft \
      | liftUp -type=.psl -pslQ stdout ~/mm2/trfFaSplit/allSubContigs.lft \
        warn stdin \
      | liftUp -type=.psl -pslQ stdout ~/mm2/jkStuff/liftAll.lft warn stdin \
      | liftUp -type=.psl stdout ../../jkStuff/liftAll.lft warn stdin \
      | pslSortAcc nohead chromPile /cluster/store2/temp stdin

    # Get rid of big pile-ups due to contamination as so:
    mkdir chrom
    cd chromPile
    foreach i (*.psl)
      echo $i
      pslUnpile -maxPile=250 $i ../chrom/${i:r}_blatMus.psl
    end
    # Load into database:
    ssh hgwdev
    cd ~/oo/bed/blatMus/chrom
    hgLoadPsl hg13 *.psl


PRODUCING CROSS_SPECIES mRNA ALIGNMENTS (DONE 12/17/02 - MATT)

Here you align vertebrate mRNAs against the masked genome on the
cluster you set up during the previous step.

o - Make sure that gbpri, gbmam, gbrod, and gbvert are downloaded from 
    Genbank into /cluster/store1/genbank.132 (in GETTING FRESH mRNA AND 
    EST SEQUENCE FROM GENBANK step)

o - Process these out of genbank flat files as so:
       ssh kkstore
       cd /cluster/store1/mrna.132
       faSplit sequence xenoRna.fa 2 xenoRna
       mkdir -p /scratch/hg/mrna.132
       cp /cluster/store1/mrna.132/xenoRna*.* /scratch/hg/mrna.132
    Request binrysnc of /scratch/hg/mrna.132 from the admins

Set up cluster run.  First make sure genome is in
kkstore:/scratch/hg/gs.14/build31/trfFa.1204 in RepeatMasked + trf form.
(This is probably done already in mouse alignment stage).  Also make
sure /scratch/hg/mrna.132 is loaded with xenoRna.fa Then do:

       ssh kkstore
       mkdir -p ~/oo/bed/xenoMrna
       cd ~/oo/bed/xenoMrna
       mkdir psl
       ls -1S /scratch/hg/gs.14/build31/trfFa.1204/*.fa.trf > human.lst
       ls -1S /scratch/hg/mrna.132/Homo_sapiens/xenoRna*.fa > mrna.lst
       cp ~/hg12/bed/xenoMrna/gsub .
       gensub2 human.lst mrna.lst gsub spec
       para create spec
       ssh kk
       cd ~/oo/bed/xenoMrna
       para try
       para check
       para push 

Do para check until the run is done, doing para push if
necessary on occassion.

Sort xeno mRNA alignments as so:
       ssh eieio
       cd ~/oo/bed/xenoMrna
       pslSort dirs raw.psl /cluster/store2/temp psl
       pslReps raw.psl cooked.psl /dev/null -minAli=0.25
       liftUp chrom.psl ../../jkStuff/liftAll.lft warn cooked.psl
       pslSortAcc nohead chrom /cluster/store2/temp chrom.psl
       pslCat -dir chrom > xenoMrna.psl
       rm -r chrom raw.psl cooked.psl chrom.psl

Load into database as so:
       ssh hgwdev
       cd ~/oo/bed/xenoMrna
       hgLoadPsl hg13 xenoMrna.psl -tNameIx

# Load the RNA and EST data
       cd /gbdb/hg13/mrna.132
       hgLoadRna add -type=xenoRna hg13 /gbdb/hg13/mrna.132/xenoRna.fa /gbdb/hg13/mrna.132/xenoRna.ra
       hgLoadRna add -type=xenoEst hg13 /gbdb/hg13/mrna.132/xenoEst.fa /gbdb/hg13/mrna.132/xenoEst.ra

Similarly do xenoEst aligments:
   Prepare the est data:
        ssh kkstore
        cd /scratch/hg/mrna.132/Homo_sapiens
        faSplit sequence  /cluster/store1/mrna.132/xenoEst.fa 16 xenoEst    

       cd /cluster/store4/gs.14/build31/bed
       mkdir xenoEst
       cd xenoEst
       mkdir psl
       ls -1S /scratch/hg/gs.14/build31/trfFa.1204/*.fa.trf > human.lst
       ls -1S /scratch/hg/mrna.132/Homo_sapiens/xenoEst?*.fa > mrna.lst
       cp ~/hg12/bed/xenoEst/gsub .

Request a binrysnc from the admin's of kkstore's /scratch/hg/mrna.132
When done, do:
       ssh kk
       gensub2 human.lst mrna.lst gsub spec
       para create spec
       para push

Sort xenoEst alignments:
       ssh eieio
       cd ~/oo/bed/xenoEst
       pslSort dirs raw.psl /cluster/store2/temp psl
       pslReps raw.psl cooked.psl /dev/null -minAli=0.10
       liftUp chrom.psl ../../jkStuff/liftAll.lft warn cooked.psl
       pslSortAcc nohead chrom /cluster/store2/temp chrom.psl
       pslCat -dir chrom > xenoEst.psl
       rm -r chrom raw.psl cooked.psl chrom.psl

Load into database as so:
       ssh hgwdev
       cd ~/oo/bed/xenoEst
       hgLoadPsl hg13 xenoEst.psl -tNameIx

PRODUCING FUGU ALIGNMENTS (DONE 12/12/02 - MATT)
o - Do fugu/human alignments.
       ssh kk
       cd ~/oo/bed
       mkdir blatFugu
       cd blatFugu
       mkdir psl
       foreach f (~/oo/?{,?}/NT_??????/NT_??????.fa)
         set c=$f:t
         mkdir -p psl/$c
       end
       ls -1S /scratch/hg/fugu/split500/*.fa > fugu.lst
       ls -1S /scratch/hg/gs.14/build31/trfFa.1204/*.fa.trf > human.lst
       # Copy over gsub from previous version.
       gensub2 human.lst fugu.lst gsub spec

       para create spec
       para try
     Make sure jobs are going ok with para check.  Then
       para push -maxJobs=10000
     wait about 2 hours and do another
       para push
     do para checks and if necessary para pushes until done
     or use para shove.

o - Sort alignments as so
    ssh eieio
    cd ~/oo/bed/blatFugu
       pslCat -dir psl/NT_??????.fa | \
         liftUp -type=.psl stdout ~/oo/jkStuff/liftAll.lft warn stdin | \
         pslSortAcc nohead chrom temp stdin

o - Rename to correspond with tables as so and
    load into database:
       ssh hgwdev
       cd ~/oo/bed/blatFugu/chrom
       rm -f chr*_blatFugu.psl
       foreach i (chr?{,?}{,_random}.psl)
           set r = $i:r
           mv $i ${r}_blatFugu.psl
       end
       hgLoadPsl hg13 *.psl

Make fugu symlink
cd /gbdb/hg13
mkdir fuguSeq
cd fuguSeq
ln -s /cluster/store3/fuguSeq/fugu_v3_mask.fasta
Now load the Fugu sequence data
       hgLoadRna addSeq hg13 /gbdb/hg13/fuguSeq/fugu_v3_mask.fasta


TIGR GENE INDEX (DONE 2/26/03)
  o mkdir -p ~/hg13/bed/tigr    
    cd ~/hg13/bed/tigr  
    wget ftp://ftp.tigr.org/private/HGI_ren/TGI_track_HumanGenome_build31.tgz
    tar xvzf TGI*.tgz
    foreach f (*cattle*)
      set f1 = `echo $f | sed -e 's/cattle/cow/g'`
      mv $f $f1
    end
    foreach o (mouse cow human pig rat)
      setenv O $o
      foreach f (chr*_$o*s)
        tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
      end
    end
    ldHgGene -exon=TC hg13 tigrGeneIndex *.gff



LOAD STS MAP (todo) TODO BY TERRY I BELIEVE - HE WILL UPDATE THIS
     - login to hgwdev
      cd ~/oo/bed
      hg13 < ~/src/hg/lib/stsMap.sql
      mkdir stsMap
      cd stsMap
      bedSort /projects/cc/hg/mapplots/data/tracks/build31/stsMap.bed stsMap.bed
      - Enter database with "hg13" command.
      - At mysql> prompt type in:
          load data local infile 'stsMap.bed' into table stsMap;
      - At mysql> prompt type
          quit


LOAD CHROMOSOME BANDS (todo) ALSO TODO BY TERRY I BELIEVE
      - login to hgwdev
      cd /cluster/store4/gs.14/build31/bed
      mkdir cytoBands
      cp /projects/cc/hg/mapplots/data/tracks/oo.29/cytobands.bed cytoBands
      cd cytoBands
      hg13 < ~/src/hg/lib/cytoBand.sql
      Enter database with "hg13" command.
      - At mysql> prompt type in:
          load data local infile 'cytobands.bed' into table cytoBand;
      - At mysql> prompt type
          quit

LOAD MOUSEREF TRACK (todo)
    First copy in data from eieio to ~/oo/bed/mouseRef.  
    Then substitute 'genome' for the appropriate chromosome 
    in each of the alignment files.  Finally do:
       hgRefAlign webb hg13 mouseRef *.alignments

LOAD AVID MOUSE TRACK (todo)
      ssh cc98
      cd ~/oo/bed
      mkdir avidMouse
      cd avidMouse
      wget http://pipeline.lbl.gov/tableCS-LBNL.txt
      hgAvidShortBed *.txt avidRepeat.bed avidUnique.bed
      hgLoadBed avidRepeat avidRepeat.bed
      hgLoadBed avidUnique avidUnique.bed

LOAD SNPS (done; D Thomas; build 111 on 3/4/03)
      ssh hgwdev
      cd ~/oo/bed
      mkdir snp
      cd snp
      mkdir build111
      cd build111
      ln -s ../../../seq_contig.md .
      ln -s ../build110/filter.awk .

     -Download SNPs from ftp://ftp.ncbi.nlm.nih.gov/human/snp/human.b31.snp111
     -Unpack
      calcFlipSnpPos seq_contig.md human.b31.snp111 human.b31.snp111.flipped
      mv human.b31.snp111 human.b31.snp111.original
      gzip human.b31.snp111.original

      grep RANDOM       human.b31.snp111.flipped >  snpTsc.txt
      grep MIXED        human.b31.snp111.flipped >> snpTsc.txt
      grep BAC_OVERLAP  human.b31.snp111.flipped >  snpNih.txt
      grep OTHER        human.b31.snp111.flipped >> snpNih.txt

      awk -f filter.awk snpTsc.txt > snpTsc.contig.bed
      awk -f filter.awk snpNih.txt > snpNih.contig.bed

      liftUp snpTsc.bed ../../../jkStuff/liftAll.lft warn snpTsc.contig.bed
      liftUp snpNih.bed ../../../jkStuff/liftAll.lft warn snpNih.contig.bed

      hgLoadBed hg13 snpTsc snpTsc.bed
      hgLoadBed hg13 snpNih snpNih.bed

     -gzip all of the big files

LOAD ENSEMBL GENES (TODO)
     cd ~/oo/bed
     mkdir ensembl
     cd ensembl

        Get the ensembl gene data as below:
        GET http://www.ebi.ac.uk/~stabenau/human_8_30.gtf.gz > ensGene.gz
        (The above may only be a temproary location)

        Get the ensembl protein data from http://www.ensembl.org/Homo_sapiens/martview
        Follow this sequence through the pages:
        Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
        Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
        Page 3) Choose the "Structures" box. 
        Page 4) Choose Transcripts/Proteins and GTF as the ouput, choose gzip compression and then hit export.

        gunzip the file and name to ensembl.gtf


# Ensembl handles random chromosomes differently than us, so we
# strip this data.  Fortunately it just loses a couple of genes.
     grep -v ^6_DR51 ensembl.gtf | grep -v _NT_ > unrandom.gtf

# Add "chr" to front of each line in the gene data gtf file to make 
# it compatible with ldHgGene 
     ~matt/bin/addchr.pl unrandom.gtf ensGene.gtf
     ./fixEns.pl ensGene.gtf ensFixed.gtf 
     ldHgGene hg13 ensGene ensGene.gtf

o - Load Ensembl peptides:
     Get them from ensembl as above in the gene section except for
     Page 3) Choose the "Sequences" box. 
     Page 4) Choose GTF as the ouput, choose gzip compression and then hit export.

     Substitute ENST for ENSP in ensPep with the program called subs
     edit subs.in to read: ENSP|ENST
     subs -e ensPep.fa > /dev/null

     Run fixPep.pl ensPep.fa ensembl.pep
     hgPepPred hg13 generic ensPep ensembl.pep


LOAD VEGA GENES AND PSEUDOGENES (DONE 04/24/03)
    mkdir ~/hg13/bed/vega
    cd ~/hg13/bed/vega
    # Saved email attachment from Steve Searle <searle@sanger.ac.uk> to
    # ~/hg13/bed/vega/vegagtf.tar.gz
    gunzip -c vegagtf.tar.gz | tar xvf -
    # Add "chr" to the beginning of each GTF line:
    foreach f (chr*.gtf)
      sed -e 's/^/chr/' $f > $f:r-fixed.gtf
    end
    # The pep files need their not-yet-stable otter ID's translated to 
    # the transcript_id's used in the GTF.  
    foreach f (chr*.fa)
      set c = `echo $f:t:r | awk -F_ '{print $1;}'`
      set g = ${c}_from_ev.gtf
      ./fixOtterIds.pl $g $f > $f:r-fixed.fa
    end
    cat chr*.gtf.tab > vegaInfo.tab
    # Load genes and Immunoglobulin/Pseudogenes into 2 separate tracks
    cat chr*-fixed.gtf \
    | egrep -vw '(Pseudogene|Ig_Segment|Ig_Pseudogene_Segment)' \
    | ldHgGene hg13 vegaGene stdin
    cat chr*-fixed.gtf \
    | egrep -w '(Pseudogene|Ig_Segment|Ig_Pseudogene_Segment)' \
    | ldHgGene hg13 vegaPseudoGene stdin
    hgPepPred hg13 generic vegaPep chr*-fixed.fa
    hgsql hg13 < ~/kent/src/hg/lib/vegaInfo.sql
    echo "load data local infile 'vegaInfo.tab' into table vegaInfo" \
    | hgsql hg13
    # Set cdsStart and cdsEnd to 0 if method is Novel_Transcript
    foreach ntname (`echo 'select name from vegaGene,vegaInfo \
                           where vegaGene.name = vegaInfo.transcriptId AND \
                                 vegaInfo.method = "Novel_Transcript"' \
                     | hgsql -N hg13`)
      echo "update vegaGene set cdsStart = 0 where name = '$ntname'" \
      | hgsql hg13
      echo "update vegaGene set cdsEnd = 0 where name = '$ntname'" \
      | hgsql hg13
    end

LOAD RNAGENES
      ssh hgwdev
      mkdir -p ~/hg13/bed/rnaGene
      cd ~/hg13/bed/rnaGene
      wget ftp://ftp.genetics.wustl.edu/pub/eddy/pickup/ncrna-hg13.gff.gz
      gunzip -c ncrna-hg13.gff.gz | grep -v '^#' > contig.gff
      liftUp chrom.gff ../../jkStuff/liftAll.lft warn contig.gff
      echo 'drop table hgRnaGene;' | hgsql hg13
      hgsql hg13 < ~/kent/src/hg/lib/rnaGene.sql
      hgRnaGenes hg13 chrom.gff

LOAD EXOFISH (todo)
     - login to hgwdev
     - cd /cluster/store4/gs.14/build31/bed
     - mkdir exoFish
     - cd exoFish
     - hg13 < ~kent/src/hg/lib/exoFish.sql
     - Put email attatchment from Olivier Jaillon (ojaaillon@genoscope.cns.fr)
       into /cluster/store4/gs.14/build31/bed/exoFish/all_maping_ecore
     - awk -f filter.awk all_maping_ecore > exoFish.bed
     - hgLoadBed hg13 exoFish exoFish.bed

LOAD MOUSE SYNTENY (TO DO)
     ssh hgwdev
     mkdir -p ~/oo/bed/mouseSyn
     cd ~/oo/bed/mouseSyn
     # Saved Michael Kamal's email attachment: allDirectedSegmentsBySize300.txt
     # Process the .txt file (minus header) into a bed 6 + file:
     grep -v "^#" allDirectedSegmentsBySize300.txt \
       | awk '($6 > $5) {printf "%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\n", $4, $5-1, $6, $1, 999, $7, $2-1, $3, $8;} \
              ($5 > $6) {printf "%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\n", $4, $6-1, $5, $1, 999, $7, $2-1, $3, $8;}' \
       > mouseSynWhd.bed
     hgLoadBed -noBin -sqlTable=$HOME/kent/src/hg/lib/mouseSynWhd.sql \
       hg13 mouseSynWhd mouseSynWhd.bed

LOAD GENIE (todo)
     - cat */ctg*/ctg*.affymetrix.gtf > predContigs.gtf
     - liftUp predChrom.gtf ../../jkStuff/liftAll.lft warn predContigs.gtf
     - ldHgGene hg13 genieAlt predChrom.gtf

     - cat */ctg*/ctg*.affymetrix.aa > pred.aa
     - hgPepPred hg13 genie pred.aa 

     - hg13
         mysql> delete * from genieAlt where name like 'RS.%';
         mysql> delete * from genieAlt where name like 'C.%';

LOAD SOFTBERRY GENES (DONE 02/04/03)
     mkdir -p ~/oo/bed/softberry
     cd ~/oo/bed/softberry
     wget ftp://www.softberry.com/pub/sc_fgenesh_hum_nov02up/sc_fgenesh_hum_nov02up.tar.gz
     gunzip -c sc_fgenesh_hum_nov02up.tar.gz | tar xvf -
     ldHgGene hg13 softberryGene chr*.gff
     hgPepPred hg13 softberry *.protein
     hgSoftberryHom hg13 *.protein

LOAD GENEID GENES (DONE 01/28/03)
    mkdir ~/oo/bed/geneid
    cd ~/oo/bed/geneid
    mkdir download
    cd download
    # Now download *.gtf and *.prot from 
    wget -r http://www1.imim.es/genepredictions/H.sapiens/golden_path_20021114/geneid_v1.1/
    # oops, due to links in the index.html, it tries to get too much.  
    # ctrl-c it when it starts to download other directories.
    mv www1.imim.es/genepredictions/H.sapiens/golden_path_20021114/geneid_v1.1/*.{gtf,prot} .
    rm -r www1.imim.es/
    # Add missing .1 to protein id's
    foreach f (*.prot)
      perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
    end
    cd ..
    ldHgGene hg13 geneid download/*.gtf -exon=CDS
    hgPepPred hg13 generic geneidPep download/*-fixed.prot

LOAD ACEMBLY (DONE 02/14/03)
    mkdir -p ~/oo/bed/acembly
    cd ~/oo/bed/acembly
    # Get acembly*gene.gff from Jean and Danielle Thierry-Mieg
    wget ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_31.human.genes/acembly.ncbi_31.genes.proteins.fasta.tar.gz
    wget ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_31.human.genes/acembly.ncbi_31.genes.gff.tar.gz
    gunzip -c acembly.ncbi_31.genes.gff.tar.gz | tar xvf -
    gunzip -c acembly.ncbi_31.genes.proteins.fasta.tar.gz | tar xvf -
    cd acembly.ncbi_31.genes.gff
    # Save just the floating-contig features to different files for lifting 
    # and lift up the floating-contig features to chr*_random coords:
    foreach f (acemblygenes.*.gff)
      set c=$f:r:e
      egrep '^[a-zA-Z0-9]+\|NT_[0-9][0-9][0-9][0-9][0-9][0-9]' $f | \
        perl -wpe 's/^(\w+)\|(\w+)/$1\/$2/' > ctg-chr${c}_random.gff
      if (-e ../../../$c/lift/random.lft) then
        liftUp chr${c}_random.gff ../../../$c/lift/random.lft warn \
          ctg-chr${c}_random.gff
      endif
      # Strip out _random or floating contig lines from the normal chrom gff,
      # and add the "chr" prefix:
      grep -v ^$c\| $f | grep -v ^Hs | perl -wpe 's/^/chr/;' > chr$c.gff
    end

    cd ../acembly.ncbi_31.genes.proteins.fasta
    #- Remove G_t*_ prefixes from acemblyproteins.*.fasta:
    foreach f (acemblyproteins.*.fasta)
      perl -wpe 's/^\>G_t[\da-zA-Z]+_/\>/' $f > chr$f:r:e.fa
    end

    #- Load into database:
    cd ..
    ldHgGene hg13 acembly acembly.ncbi_31.genes.gff/chr*.gff
    hgPepPred hg13 generic acemblyPep \
      acembly.ncbi_31.genes.proteins.fasta/chr*.fa

LOAD GENOMIC DUPES (done June 2003 Heather and Jim)
o - Load genomic dupes
    ssh hgwdev
    cd /cluster/store4/gs.14/build31/bed
    mkdir segmentalDups
    cd segmentalDups
    wget http://codon/jab/web/takeoff/oo33_dups_for_kent.zip
    unzip *.zip
    awk -f filter.awk oo33_dups_for_kent > genomicSuperDups.bed
    mysql -u hgcat -pbigSECRET hg13 < ~/src/hg/lib/genomicSuperDups.sql
    echo "load data local infile 'genomicSuperDups.bed' into table genomicSuperDups" | hg13

LOAD NCI60 ( DONE. Mon May  5 14:03:23 PDT 2003 - sugnet )
o - # ssh hgwdev
    cd /projects/cc/hg/mapplots/data/NCI60/dross_arrays_nci60/
    mkdir hg13
    cd hg13
    findStanAlignments hg13 ../BC2.txt.ns ../../image/cumulative_plates.011204.list.human hg13.image.psl >& hg13.image.log 
    cp ../experimentOrder.txt ./
    sed -e 's/ / \.\.\//g' < experimentOrder.txt > epo.txt
    egrep -v unknown hg13.image.psl > hg13.image.good.psl
    stanToBedAndExpRecs  hg13.image.good.psl hg13.nci60.exp hg13.nci60.bed `cat epo.txt`
    hg13S -A < ../../scripts/nci60.sql
    echo "load data local infile 'hg13.nci60.bed' into table nci60" | hg13S -A
    mkdir /cluster/store4/gs.14/build31/bed/nci60
    mv hg13.nci60.bed /cluster/store4/gs.14/build31/bed/nci60
    rm *.psl 

LOAD AFFYRATIO [GNF] (TO DO)
o - # ssh hgwdev
    cd /cluster/store1/sugnet/
    mkdir gs.14
    mkdir gs.14/build31
    mkdi20r gs.14/build31/affyGnf
    cd gs.14/build31/affyGnf
    cp /projects/compbiodata/microarray/affyGnf/sequences/HG-U95Av2_target ./
    ls -1 /cluster/store4/gs.14/build31/trfFa.1204/ > allctg.lst
    echo "/cluster/store1/sugnet/gs.14/build31/affyGnf/HG-U95Av2_target" > affy.lst
    echo '#LOOP\n/cluster/bin/i386/blat -mask=lower -minIdentity=95 -ooc=/cluster/store4/gs.14/build31/jkStuff/post.refCheck.old/11.ooc /cluster/store4/gs.14/build31/trfFa.1204/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
    gensub2 allctg.lst affy.lst template.sub para.spec
    # ssh kkr1u00
    para create para.spec
    para try
    para check
    para push
    # exit kkr1u00
    pslSort dirs hg13.affy.psl tmp psl >& pslSort.log
    liftUp hg13.affy.lifted.psl /cluster/store4/gs.14/build31/jkStuff/liftAll.lft warn hg13.affy.psl
    pslAffySelect seqIdent=.95 basePct=.95 in=hg13.affy.lifted.psl out=hg13.affy.pAffySelect.95.95.psl
    affyPslAndAtlasToBed hg13.affy.pAffySelect.95.95.psl  /projects/compbiodata/microarray/affyGnf/human_atlas_U95_gnf.noquotes.txt affyRatio.bed affyRatio.exr >& affyPslAndAtlasToBed.log 
    hg13S -A </projects/compbiodata/microarray/affyGnf/browserFiles/affyRatio.sql 
    echo "load data local infile 'affyRatio.bed' into table affyRatio" | hg13S -A
    mkdir /cluster/store4/gs.14/build31/bed/affyGnf
    rm -rf psl tmp err *.psl *.bed HG-U95Av2_target 

LOAD SAGE DATA (DONE 03/16/03)
    ssh hgwdev
    cd ~/kent/src/hg/sage
    make
    cd /projects/cc/hg/sugnet/sage
    mkdir sage.XXX # XXX = current uniGene build
    cd sage.XXX
    ncftp ftp://ftp.ncbi.nih.gov/pub/sage
      mget -R map/readme.txt map/info.txt extr info map/Hs
      quit
    mkdir map
    mv Hs map
    cd map/Hs/NlaIII
    unzip -j SAGEmap_tag_ug-rel.zip
    cd ../../../extr/
    ../../scripts/summarizeCounts.pl expCounts.tab ./SAGE_*
    ../../scripts/countGenesPerTag.pl expCounts.tab allTags.count.tab
    ../../scripts/createArraysForTags.pl allTags.count.tab tagExpArrays.tab \
      ./SAGE_*
    ../../scripts/countsPerExp.pl expCounts.tab expList.tab
    cd ../map/Hs/NlaIII/ 
    perl -e 'while (<>) { \
               chomp($_); \
               @p = split(/\t/, $_); \
               print "$p[2]\t$p[3]\t$p[0]\n"\
             }' \
      < SAGEmap_tag_ug-rel | sort | sed -e 's/ /_/g' \
      > SAGEmap_ug_tag-rel_Hs
    cd -
    createSageSummary ../map/Hs/NlaIII/SAGEmap_ug_tag-rel_Hs \
      tagExpArrays.tab sageSummary.sage
    # Create the uniGene alignments 
    # ~/hg13/uniGene/hg13.uniGene.lifted.pslReps.psl
    # -- see "MAKE UNIGENE ALIGNMENTS" below
    cd /projects/cc/hg/sugnet/sage/sage.$version/extr
    addAveMedScoreToPsls \
      ~/hg13/bed/uniGene.$version/hg13.uniGene.lifted.pslReps.psl \
      sageSummary.sage  uniGene.wscores.bed
    hgLoadBed hg13 uniGene_2 uniGene.wscores.bed
    hgsql hg13 < ~kent/src/hg/lib/sage.sql 
    echo "load data local infile 'sageSummary.sage' into table sage" \
        | hgsql hg13
    cd ../info
    ../../scripts/parseRecords.pl ../extr/expList.tab  > sageExp.tab
    hgsql hg13 < ~/kent/src/hg/lib/sageExp.sql 
    echo "load data local infile 'sageExp.tab' into table sageExp" | hgsql hg13
    # update ~/kent/src/hg/makeDb/trackDb/human/hg13/uniGene_2.html 
    # with current uniGene date. 

    
MAKE UNIGENE ALIGNMENTS (DONE 03/16/03)
    # Download of the latest UniGene version is now automated by a 
    # cron job -- see /cluster/home/angie/crontab , 
    # /cluster/home/angie/unigeneVers/unigene.csh .  
    # If hgwdev gets rebooted, that needs to be restarted... maybe there's 
    # a more stable place to set up that cron job.  

    # substitute XXX -> the uniGene version used by SAGE, if building the 
    # uniGene/SAGE track;  or just the latest uniGene version in 
    # /projects/cc/hg/sugnet/uniGene/ , if doing uniGene alignments only.
    set version = XXX
    cd /projects/cc/hg/sugnet/uniGene/uniGene.$version
    gunzip Hs.seq.uniq.gz 
    ../countSeqsInCluster.pl Hs.data counts.tab
    ../parseUnigene.pl Hs.seq.uniq Hs.seq.uniq.simpleHeader.fa leftoverData.tab
    # Distribute UniGene sequence to /iscratch/i/ (kkstore can see /projects)
    ssh kkstore
    set version = XXX # same as above
    mkdir -p /iscratch/i/uniGene.$version
    cp -p \
  /projects/cc/hg/sugnet/uniGene/uniGene.$version/Hs.seq.uniq.simpleHeader.fa \
      /iscratch/i/uniGene.$version
    ssh kkr1u00
    ~kent/bin/iSync
    ssh kk
    set version = XXX # same as above
    mkdir -p ~/hg13/bed/uniGene.$version
    cd ~/hg13/bed/uniGene.$version
    ls -1S /cluster/store4/gs.14/build31/trfFa/* > allctg.lst
    ls -1  /iscratch/i/uniGene.$version/Hs.seq.uniq.simpleHeader.fa \
      > uniGene.lst
    echo '#LOOP\n/cluster/bin/i386/blat -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
    gensub2 allctg.lst uniGene.lst template.sub para.spec
    para create para.spec
    mkdir psl
    para try
    para check
    para push
    ssh eieio
    set version = XXX # same as above
    cd ~/hg13/bed/uniGene.$version
    pslSort dirs raw.psl tmp psl >& pslSort.log
    liftUp -type=.psl stdout ../../jkStuff/liftAll.lft warn raw.psl \
    | pslReps -minCover=0.2 -sizeMatters -minAli=0.98 -nearTop=0.002 \
      stdin hg13.uniGene.lifted.pslReps.psl /dev/null
    # use hg13.uniGene.lifted.pslReps.psl for building SAGE track (above).


FAKING DATA FROM PREVIOUS VERSION
(This is just for until proper track arrives.  Rescues about
97% of data  Just an experiment, not really followed through on).

o - Rescuing STS track:
     - log onto hgwdev
     - mkdir ~/oo/rescue
     - cd !$
     - mkdir sts
     - cd sts
     - bedDown hg3 mapGenethon sts.fa sts.tab
     - echo ~/oo/sts.fa > fa.lst
     - pslOoJobs ~/oo ~/oo/rescue/sts/fa.lst ~/oo/rescue/sts g2g
     - log onto cc01
     - cc ~/oo/rescue/sts
     - split all.con into 3 parts and condor_submit each part
     - wait for assembly to finish
     - cd psl
     - mkdir all
     - ln ?/*.psl ??/*.psl *.psl all
     - pslSort dirs raw.psl temp all
     - pslReps raw.psl contig.psl /dev/null
     - rm raw.psl
     - liftUp chrom.psl ../../../jkStuff/liftAll.lft carry contig.psl
     - rm contig.psl
     - mv chrom.psl ../convert.psl


LOADING MOUSE MM3 BLASTZ ALIGNMENTS FROM PENN STATE: (IN PROGRESS 03/10/03)

    # Translate Penn State .lav files into sorted axt:
    ssh eieio
    set base="/cluster/store4/gs.14/build31/bed/blastz.mm3.2003-03-09-ASH"
    set seq1_dir="/cluster/store4/gs.14/build31/mixedNib/"
    set seq2_dir="/cluster/store2/mm.2003.02/mm3/mixedNib/"
    set tbl="blastzMm"
    cd $base
    mkdir -p axtChrom
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=$base/axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      cat `ls -1 *.lav | sort -g` \
        | lavToAxt stdin $seq1_dir $seq2_dir stdout \
        | axtSort stdin $out
      popd
    end

    # Translate the sorted axt files into psl:
    cd $base
    mkdir -p pslChrom
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end

    # Load tables
TODO    ssh hgwdev
    set base="/cluster/store4/gs.14/build31/bed/blastz.mm3.2003-03-09-ASH"
    set tbl="blastzMm"
    cd $base/pslChrom
    hgLoadPsl hg13 chr*_${tbl}.psl

MAKING THE BLASTZBESTMOUSE TRACK FROM PENN STATE MM3 AXT FILES (IN PROGRESS 03/10/03)

    # Consolidate AXT files to chrom level, sort, pick best, make psl.
    ssh eieio
    set base="/cluster/store4/gs.14/build31/bed/blastz.mm3.2003-03-09-ASH"
    set tbl="blastzBestMouse"
    cd $base
    mkdir -p axtBest pslBest
    foreach chrdir (lav/chr*)
      set chr=$chrdir:t
      echo axtBesting $chr
      axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300
      echo translating axtBest to psl for $chr
      axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
    end
    # If a chromosome has so many alignments that axtBest runs out of mem,
    # run axtBest in 2 passes to reduce size of the input to final axtBest:
    foreach chrdir (lav/chr19)
      set chr=$chrdir:t
      echo two-pass axtBesting $chr
      foreach a ($chrdir/*.axt)
        axtBest $a $chr $a:r.axtBest
      end
      cat `ls -1 $chrdir/*.axtBest | sort -g` \
        > $chrdir/$chr.axtBestPieces
      axtBest $chrdir/$chr.axtBestPieces $chr axtBest/$chr.axt
      axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
    end

    # Load tables
     ssh hgwdev
     set base="/cluster/store4/gs.14/build31/bed/blastz.mm3.2003-03-09-ASH"
     set tbl="blastzBestMouse"
     cd $base/pslBest
     hgLoadPsl hg13 chr*_${tbl}.psl

    # Make /gbdb links and add them to the axtInfo table:
     mkdir -p /gbdb/hg13/axtBestMm3
     cd /gbdb/hg13/axtBestMm3
     foreach f ($base/axtBest/chr*.axt)
       ln -s $f .
     end
     cd $base/axtBest
     rm -f axtInfoInserts.sql
     touch axtInfoInserts.sql
     foreach f (/gbdb/hg13/axtBestMm3/chr*.axt)
       set chr=$f:t:r
       echo "INSERT INTO axtInfo VALUES ('mm3','Blastz Best in Genome','$chr','$f');" \
         >> axtInfoInserts.sql
     end
     hgsql hg13 < ~/kent/src/hg/lib/axtInfo.sql
     hgsql hg13 < axtInfoInserts.sql

MAKING THE AXTTIGHT FROM AXTBEST (IN PROGRESS 03/10/03)
    # After creating axtBest alignments above, use subsetAxt to get axtTight:
    ssh eieio
    cd ~/hg13/bed/blastz.mm3.2003-03-09-ASH/axtBest
    mkdir -p ../axtTight
    foreach i (*.axt)
      subsetAxt  $i ../axtTight/$i \
        ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    end
    # translate to psl
    cd ../axtTight
    mkdir -p ../pslTight
    foreach i (*.axt)
      set c = $i:r
      axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightMouse.psl
    end
    # Load tables into database
    ssh hgwdev
    cd ~/hg13/bed/blastz.mm3.2003-03-09-ASH/pslTight
    hgLoadPsl hg13 chr*_blastzTightMouse.psl

BEGINNING OF RAT BLASTZ

LOADING RAT RN2 BLASTZ ALIGNMENTS FROM PENN STATE: (DONE 03/14/03)

    # Translate Penn State .lav files into sorted axt:
    ssh eieio
    set base="/cluster/store4/gs.14/build31/bed/blastz.rn2.2003-03-13-ASH"
    set seq1_dir="/cluster/store4/gs.14/build31/mixedNib/"
    set seq2_dir="/cluster/store4/rn2/mixedNib/"
    set tbl="blastzRn2"
    cd $base
    mkdir -p axtChrom
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=$base/axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      cat `ls -1 *.lav | sort -g` \
        | lavToAxt stdin $seq1_dir $seq2_dir stdout \
        | axtSort stdin $out
      popd
    end

# STOPPED HERE -- big data, low demand.
    # Translate the sorted axt files into psl:
    cd $base
    mkdir -p pslChrom
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end
    # Load tables
    ssh hgwdev
    set base="/cluster/store4/gs.14/build31/bed/blastz.rn2.2003-03-13-ASH"
    set tbl="blastzRn2"
    cd $base/pslChrom
    hgLoadPsl hg13 chr*_${tbl}.psl

MAKING THE BLASTZBESTRAT TRACK FROM PENN STATE RN2 AXT FILES (DONE 03/14/03)

    # Consolidate AXT files to chrom level, sort, pick best, make psl.
    ssh eieio
    set base="/cluster/store4/gs.14/build31/bed/blastz.rn2.2003-03-13-ASH"
    set tbl="blastzBestRn2"
    cd $base
    mkdir -p axtBest pslBest
    foreach chrdir (lav/chr*)
      set chr=$chrdir:t
      echo axtBesting $chr
      axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300
      echo translating axtBest to psl for $chr
      axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
    end

    # Load tables
     ssh hgwdev
     set base="/cluster/store4/gs.14/build31/bed/blastz.rn2.2003-03-13-ASH"
     set tbl="blastzBestRn2"
     cd $base/pslBest
     hgLoadPsl hg13 chr*_${tbl}.psl

    # Make /gbdb links and add them to the axtInfo table:
     mkdir -p /gbdb/hg13/axtBestRn2
     cd /gbdb/hg13/axtBestRn2
     foreach f ($base/axtBest/chr*.axt)
       ln -s $f .
     end
     cd $base/axtBest
     rm -f axtInfoInserts.sql
     touch axtInfoInserts.sql
     foreach f (/gbdb/hg13/axtBestRn2/chr*.axt)
       set chr=$f:t:r
       echo "INSERT INTO axtInfo VALUES ('rn2','Blastz Best in Genome','$chr','$f');" \
         >> axtInfoInserts.sql
     end
     hgsql hg13 < ~/kent/src/hg/lib/axtInfo.sql
     hgsql hg13 < axtInfoInserts.sql

MAKING THE AXTTIGHT FROM AXTBEST (DONE 03/14/03)
    # After creating axtBest alignments above, use subsetAxt to get axtTight:
    ssh eieio
    cd ~/hg13/bed/blastz.rn2.2003-03-13-ASH/axtBest
    mkdir -p ../axtTight
    foreach i (*.axt)
      subsetAxt  $i ../axtTight/$i \
        ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    end
    # translate to psl
    cd ../axtTight
    mkdir -p ../pslTight
    foreach i (*.axt)
      set c = $i:r
      axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightRn2.psl
    end
    # Load tables into database
    ssh hgwdev
    cd ~/hg13/bed/blastz.rn2.2003-03-13-ASH/pslTight
    hgLoadPsl hg13 chr*_blastzTightRn2.psl

XXX END OF RAT BLASTZ

BEGINNING OF HUMAN BLASTZ

LOADING HUMAN HG13 (SELF) BLASTZ ALIGNMENTS: (DONE 02/20/03)

    # Translate Penn State .lav files into sorted axt, with alignments 
    # to self/diagonal dropped:
    ssh eieio
    set base="/cluster/store4/gs.14/build31/bed/blastz.hg13.2003-02-18-ASH"
    set seq1_dir="/cluster/store4/gs.14/build31/mixedNib/"
    set seq2_dir="/cluster/store4/gs.14/build31/mixedNib/"
    set tbl="blastzHuman"
    cd $base
    mkdir -p axtChrom
    # sometimes alignments are so huge that they cause axtSort to run out 
    # of memory.  Run them in two passes like this:
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=$base/axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      foreach d (*.lav)
        set smallout=$d.axt
        lavToAxt $d $seq1_dir $seq2_dir stdout \
        | axtDropSelf stdin stdout \
        | axtSort stdin $smallout
      end
      cat `ls -1 *.lav.axt | sort -g` \
        > $out
      popd
    end

    # Translate the sorted axt files into psl:
    cd $base
    mkdir -p pslChrom
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo translating $c.axt to psl
      axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end

    # Load tables
    ssh hgwdev
    set base="/cluster/store4/gs.14/build31/bed/blastz.hg13.2003-02-18-ASH"
    set tbl="blastzHuman"
    cd $base/pslChrom
    hgLoadPsl hg13 chr*_${tbl}.psl

MAKING THE BLASTZBESTHUMAN TRACK FROM UNFILTERED AXT FILES (DONE 02/20/03)

    # Consolidate AXT files to chrom level, sort, pick best, make psl.
    ssh eieio
    set base="/cluster/store4/gs.14/build31/bed/blastz.hg13.2003-02-18-ASH"
    set tbl="blastzBestHuman"
    cd $base
    mkdir -p axtBest pslBest
    # run axtBest in 2 passes to reduce size of the input to final axtBest:
    foreach chrdir (lav/*)
      set chr=$chrdir:t
      echo two-pass axtBesting $chr
      foreach a ($chrdir/*.axt.gz)
        gunzip -c $a \
        | axtBest stdin $chr $a:r.axtBest
      end
      cat `ls -1 $chrdir/*.axtBest | sort -g` \
        > $chrdir/$chr.axtBestPieces
      axtBest $chrdir/$chr.axtBestPieces $chr axtBest/$chr.axt
      axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
    end

    # Load tables
     ssh hgwdev
     set base="/cluster/store4/gs.14/build31/bed/blastz.hg13.2003-02-18-ASH"
     set tbl="blastzBestHuman"
     cd $base/pslBest
     hgLoadPsl hg13 chr*_${tbl}.psl

    # Make /gbdb links and add them to the axtInfo table:
     mkdir -p /gbdb/hg13/axtBestHg13
     cd /gbdb/hg13/axtBestHg13
     foreach f ($base/axtBest/chr*.axt)
       ln -s $f .
     end
     cd $base/axtBest
     rm -f axtInfoInserts.sql
     touch axtInfoInserts.sql
     foreach f (/gbdb/hg13/axtBestHg13/chr*.axt)
       set chr=$f:t:r
       echo "INSERT INTO axtInfo VALUES ('hg13','Blastz Best Human Self','$chr','$f');" \
         >> axtInfoInserts.sql
     end
     hgsql hg13 < ~/kent/src/hg/lib/axtInfo.sql
     hgsql hg13 < axtInfoInserts.sql

MAKING THE AXTTIGHT FROM AXTBEST (DONE 02/20/03)
    # After creating axtBest alignments above, use subsetAxt to get axtTight:
    ssh eieio
    cd ~/hg13/bed/blastz.hg13.2003-02-18-ASH/axtBest
    mkdir -p ../axtTight
    foreach i (*.axt)
      subsetAxt  $i ../axtTight/$i \
        ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    end
    # translate to psl
    cd ../axtTight
    mkdir -p ../pslTight
    foreach i (*.axt)
      set c = $i:r
      axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightHuman.psl
    end
    # Load tables into database
    ssh hgwdev
    cd ~/hg13/bed/blastz.hg13.2003-02-18-ASH/pslTight
    hgLoadPsl hg13 chr*_blastzTightHuman.psl


XXX END OF HUMAN BLASTZ


LIFTING REPEATMASKER .ALIGN FILES (DONE 12/13/02)

foreach d (?{,?}/NT_??????)
  set c=$d:t
  cd $d
  echo $c to $c.fa.align
  /cluster/bin/scripts/liftRMAlign.pl $c.lft > $c.fa.align
  cd ../..
end

foreach chr (?{,?})
  cd $chr
  echo making symbolic links for chr$chr NT .fa.align files
  foreach ctg (NT_??????)
    ln -s $ctg/$ctg.fa.align
  end
  cd ..
  if (-e $chr/lift/ordered.lft) then
    echo making $chr/chr$chr.fa.align
    /cluster/bin/scripts/liftRMAlign.pl $chr/lift/ordered.lft \
      > $chr/chr$chr.fa.align
  endif
  if (-e $chr/lift/random.lft) then
    echo making $chr/chr${chr}_random.fa.align
    /cluster/bin/scripts/liftRMAlign.pl $chr/lift/random.lft \
      > $chr/chr${chr}_random.fa.align
  endif
  echo removing symbolic links for chr$chr NT .fa.align files
  rm $chr/NT_??????.fa.align
end


MITOCHONDRIAL DNA PSEUDO-CHROMOSOME - TODO

Download the fasta file from http://www.gen.emory.edu/MITOMAP/mitomapRCRS.fasta
Put it in /cluster/store1/mrna.132
ssh hgwdev
cd ~/oo
mkdir M
cp /cluster/store1/mrna.132/mitomapRCRS.fasta M/chrM.fa
Edit jkStuff/makeNib.sh to make sure it also has the "M" directory in its file list
tcsh jkStuff/makeNib.sh
hgNibSeq -preMadeNib hg13 /cluster/store4/gs.14/build31/nib ?/chr*.fa ??/chr*.fa


TWINSCAN GENE PREDICTIONS (DONE 1/9/03)
    mkdir -p ~/oo/bed/twinscanchr_gtf
    cd ~/oo/bed/twinscan
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
      rm -f chr$c.{gtf,ptx}
      wget http://genome.cs.wustl.edu/~bio/human/NCBI31/12-30-02/chr_gtf/chr$c.gtf
      wget http://genome.cs.wustl.edu/~bio/human/NCBI31/12-30-02/chr_ptx/chr$c.ptx
      # clean up chrom name and put chrom in transcript_id:
      perl -wpe 's/^chr(\w+)\.\d+\.\d+(.*)transcript_id "(\d+\.\d+).a"/chr$1$2transcript_id "$1.$3.a"/' \
        < chr$c.gtf > chr$c-fixed.gtf
      # pare down protein FASTA header to id and add missing .a:
      perl -wpe 's/^\>.*\s+source_id\s*\=\s*(\S+)\s+chr=(\w+).*$/\>$2.$1.a/;' \
        < chr$c.ptx > chr$c-fixed.fa
    end
    ldHgGene hg13 twinscan chr*-fixed.gtf -exon=CDS
    hgPepPred hg13 generic twinscanPep chr*-fixed.fa

LOAD CHIMP DATA

# Load Ingo Ebersber's chimp BLAT alignments
cd ~/oo
mkdir bed/blatChimp
cd bed/blatChimp

# Get the chimp sequence
wget http://www.cs.uni-duesseldorf.de/~ebersber/annotation_track_chimp/downloads/mpi-aligned_seqparts_jun02.fa.gz

cp ~/hg12/bed/blatChimp/gsub .
mkdir -p /mnt/scratch/hg/chimp
cp ./mpi-aligned_seqparts_jun02.fa /mnt/scratch/hg/chimp/chimp.fa
ls -1S /mnt/scratch/hg/chimp/chimp.fa > chimp.lst
ls -1S /mnt/scratch/hg/gs.14/build31/trfFa.1204/*.fa.trf > human.lst

mkdir psl
gensub2 human.lst chimp.lst gsub spec
para create spec
para try
para push
para check

o - Sort alignments as so
        ssh eieio
        cd ~/hg13/bed/blatChimp
         pslCat -dir psl | \
         liftUp -type=.psl stdout ~/oo/jkStuff/liftAll.lft warn stdin | \
         pslSortAcc nohead chrom temp stdin
         pslCat -dir chrom > blatChimp.psl

ssh hgwdev
cd hg13/bed/blatChimp
hgLoadPsl hg13 blatChimp.psl         

Make fugu symlink
cd /gbdb/hg13
mkdir fuguSeq
cd fuguSeq
ln -s /cluster/store3/fuguSeq/fugu_v3_mask.fasta
Now load the Fugu sequence data
       hgLoadRna addSeq hg13 /gbdb/hg13/fuguSeq/fugu_v3_mask.fasta

hgLoadPsl hg13 blatChimp.psl


o Load the chimp BAC data
cd ~/oo
mkdir bed/bacChimp
cd bed/bacChimp
wget http://www.cs.uni-duesseldorf.de/~ebersber/annotation_track_chimp/downloads/riken-aligned_seqparts_jun02.fa.gz

cp ~/hg12/bed/bacChimp/gsub .

mkdir -p /mnt/scratch/hg/chimp
cp ./riken-aligned_seqparts_jun02.fa /mnt/scratch/hg/chimp/chimpBac.fa
ls -1S /mnt/scratch/hg/chimp/chimpBac.fa > chimp.lst
ls -1S /mnt/scratch/hg/gs.14/build31/trfFa.1204/*.fa.trf > human.lst

mkdir psl
gensub2 human.lst chimp.lst gsub spec
para create spec
para try
para push
para check

hgLoadPsl hg13 bacChimp.psl


DATABASE DUMP, FYI
# For the first database table dump
do (it's automated after that):

ssh hgwbeta
mkdir /usr/local/apache/htdocs/goldenPath/14nov2002/database

cd /var/tmp
mkdir hg13-dump
chmod 777 hg13-dump      # (since you aren't root this is quickest)
cd hg13-dump
mysqldump --user=hguser --password=hguserstuff --all --tab=. hg11

Then, that directory will quickly fill with .sql and .txt files
When it is done do:

cd /var/tmp/hg13-dump
gzip *.txt
mv * /usr/local/apache/htdocs/goldenPath/14nov2002/database

 - Make database.zip
ssh hgwbeta
cd /usr/local/apache/htdocs/goldenPath/14nov2002/database
zip ../bigZips/database.zip *


SGP GENE PREDICTIONS (DONE 04/04/03)
    mkdir -p ~/oo/bed/sgp/download
    cd ~/oo/bed/sgp/download
    set webroot = http://genome.imim.es/genepredictions/H.sapiens/golden_path_20021114/SGP_mm2003
    foreach f (~/oo/?{,?}/chr?{,?}{,_random}.fa)
      set chr = $f:t:r
      wget $webroot/$chr.gtf
      wget $webroot/$chr.prot
    end
    wget $webroot/chrUn.gtf -O chrUn_random.gtf
    wget $webroot/chrUn.prot -O chrUn_random.prot
    # Add missing .1 to protein id's
    foreach f (*.prot)
      perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
    end
    cd ..
    ldHgGene hg13 sgpGene download/*.gtf -exon=CDS
    hgPepPred hg13 generic sgpPep download/*-fixed.prot

ALIGNED ANCIENT REPEATS FROM MOUSE BLASTZ (DONE 01/15/03)
    ssh eieio
    mkdir -p ~/oo/bed/aarMm2
    cd ~/oo/bed/aarMm2
    set mmdir=../blastz.mm2.2002-12-5-ASH
    foreach aar ($mmdir/aar/*.aar.gz)
      set c = $aar:t:r:r
      echo translating chr$c aar to axt
      zcat $aar \
      | $HOME/kent/src/hg/makeDb/hgAar/aarToAxt \
      | axtToPsl stdin $mmdir/S1.len $mmdir/S2.len stdout \
      > chr${c}_aarMm2.psl
    end   
    ssh hgwdev
    cd ~/oo/bed/aarMm2
    hgLoadPsl hg13 *.psl

ALIGNMENT COUNTS FOR WIGGLE TRACK
    # this needs to be updated to reflected the full process.
 
    - Generate BED table of AARs used to select regions.
        cat ../bed/aarMm2/*.psl | awk 'BEGIN{OFS="\t"} {print $14,$16,$17,"aar"}' >aarMm2.bed
    
    - Generate background counts with windows that have a 6kb counts,
      with a maximum windows size of 512kb and sliding the windows by
        foreach axt (../../blastz.mm2.2002-08-01/axtBest/chr*.axt)
           set chr=$axt:t:r
           set tab=$chr.6kb-aar.cnts  (??? need better name ???)
           hgCountAlign -selectBed=aarMm2.bed -winSize=512000 -winSlide=1000 -fixedNumCounts=6000 -countCoords $axt $tab
        end

    - Generate counts for AARs with 50b windows, slide by 5b
        foreach axt (../../blastz.mm2.2002-08-01/axtBest/chr*.axt)
           set chr=$axt:t:r
           set tab=$chr.50b-aar.cnts  (??? need better name ???)
           hgCountAlign -selectBed=aarMm2.bed -winSize=50 -winSlide=5 $axt $tab
        end

    - Generate counts for all with 50b windows, slide by 5b
        foreach axt (../../blastz.mm2.2002-08-01/axtBest/chr*.axt)
           set chr=$axt:t:r
           set tab=$chr.50b.cnts  (??? need better name ???)
           hgCountAlign -winSize=50 -winSlide=5 $axt $tab
        end


MAKING AND STORING mRNA AND EST ALIGNMENTS (TO DO)

o ssh to eieio
 mkdir -p /cluster/store4/gs.14/build31/bed/refFull
 cd /cluster/store4/gs.14/build31/bed/refFull

 Download the sequence: wget ftp://blue3.ims.u-tokyo.ac.jp/pub/db/hgc/dbtss/ref-full.fa.gz

 gunzip it and split the ref-rull.fa file into about 200 pieces
 gunzip ref-full.fa.gz
 faSplit sequence ref-full.fa 50 splitRefFull
 ssh kkstore
 cd /cluster/store4/gs.14/build31/bed/refFull
 mkdir /scratch/hg/refFull
 splitRefFull* /scratch/hg/refFull/
 ls -1S /scratch/hg/gs.14/build31/contig.0729/*.fa > genome.lst
 ls -1S /scratch/hg/refFull/split*.fa > refFull.lst

o - Request the admins to do a binrsync to the cluster of /scratch/hg/refFull

o - Use BLAT to generate refFull alignments as so:
      Make sure that /scratch/hg/gs.14/build31/contig/ is loaded
      with NT_*.fa and pushed to the cluster nodes.

          ssh kk
          cd /cluster/store4/gs.14/build31/bed/refFull
          mkdir -p psl
#          run mkdirs.sh script to create sudirs in the psl directory
#               in order to modularize the blat job.
          gensub2 genome.lst refFull.lst gsub spec
          para create spec
          
    Now run a para try/push and para check in each one.

o - Process refFull alignments into near best in genome.
      cd ~/oo/bed
      cd refFull
      pslSort dirs raw.psl /tmp psl/*
      pslReps -minCover=0.2 -sizeMatters -minAli=0.98 -nearTop=0.002 raw.psl contig.psl /dev/null
      liftUp -nohead all_refFull.psl ../../jkStuff/liftAll.lft carry contig.psl
      pslSortAcc nohead chrom /tmp all_refFull.psl

o - Load refFull alignments into database
      ssh hgwdev
      cd /cluster/store4/gs.14/build31/bed/refFull
      pslCat -dir chrom > refFullAli.psl
      hgLoadPsl hg13 -tNameIx refFullAli.psl


MAKING PROMOTER FILES
    cd /usr/local/apache/htdocs/goldenPath/14nov2002/bigZips
    featureBits hg13 -fa=upstream1000.fa refGene:upstream:1000
    zip upstream1000.zip upstream1000.fa
    featureBits hg13 -fa=upstream2000.fa refGene:upstream:2000
    zip upstream2000.zip upstream2000.fa
    featureBits hg13 -fa=upstream5000.fa refGene:upstream:5000
    zip upstream5000.zip upstream5000.fa
    rm upstream*.fa
    
    
    #- Take a look at bigZips/* and chromosomes/*, update their README.txt's

SLAM

#   Slam predictions for human originate from both mouse and rat

    cd /cluster/store4/gs.14/build31/bed/slam

    wget http://baboon.math.berkeley.edu/~cdewey/slam/hs_31_Nov2002_mm_3_Feb2002/gff/hsCDS.gff.gz
    gunzip hsCDS.gff.gz
    mv hsCDS.gff humanFromMouseCDS.gff
    ldHgGene -exon=CDS hg13 slamMouse

    wget http://baboon.math.berkeley.edu/~cdewey/slam/hs_31_Nov2002_rn_2_Nov2002/gff/hsCDS.gff.gz
    gunzip hsCDS.gff.gz
    mv hsCDS.gff humanFromRatCDS.gff
    ldHgGene -exon=CDS hg13 slamRat

    wget http://baboon.math.berkeley.edu/~scawley/slam/hs_31_Nov2002_mm_3_Feb2002/gff/hsCNS.bed.gz
    gunzip hsCNS.bed.gz
    mv hsCNS.bed humanFromMouseCNS.bed
    expand.pl < humanFromMouseCNS.bed > humanFromMouseCNS.bed.expand
    hgLoadBed -tab hg13 slamNonCodingMouse humanFromMouseCNS.bed.expand

    wget http://baboon.math.berkeley.edu/~scawley/slam/hs_31_Nov2002_rn_2_Nov2002/gff/hsCNS.bed.gz
    gunzip hsCNS.bed.gz
    mv hsCNS.bed humanFromRatCNS.bed
    expand.pl < humanFromRatCNS.bed > humanFromRatCNS.bed.expand
    hgLoadBed -tab hg13 slamNonCodingRat humanFromRatCNS.bed.expand


CREATING THE humMusL SAMPLE TRACK (a.k.a WIGGLE TRACK)
------------------------------------------------------
o - refer to the script at src/hg/sampleTracks/makeHg13Mm2.doc


#MAKING 
        foreach d ( ?{,?}/NT_* )

CREATING HUMAN/MOUSE/RAT ALIGNMENTS AND LOADING INTO DATABASE

# Create /cluster/store5/multiz.hmr.2 by editing and running
# the script src/hg/mouseStuff/stageMultiz/stage.csh on the file server.
# Note, you may have to insert change the cat line near the end line to
# include the flags
        -tolerate -minScore=0
# if Webb hasn't fixed multiz by the time you read this.
# 
# Next link the maf files into /gbdb as so
     ssh hgwdev
     cd /gbdb/hg13
     mkdir multizMm3Rn2
     ln -s /cluster/store5/multiz.hmr.2/hmr/*.maf multizMm3Rn2
# Now load the database
     hgLoadMaf hg13 multizMm3Rn2


# PREP FOR LIFTOVER CHAINS TO HG13

    # split into 3K chunks
    ssh eieio
    set tempDir = /cluster/bluearc/hg/hg13/liftOver
    mkdir -p $tempDir
    cd $tempDir
    mkdir lift 
cat > split.csh << 'EOF'
    set scratch = /iscratch/i/hg13/liftOver/split
    mkdir -p $scratch
    foreach i (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y M)
        echo chr$i
        faSplit -lift=lift/chr$i.lft size /cluster/data/hg13/$i/chr$i.fa -oneFile 3000 $scratch/chr$i
    end
'EOF'
    csh split.csh >&! split.log &
    tail -100f split.log

    ssh kkr1u00
    /cluster/bin/iSync


# LIFTOVER CHAINS TO HG16 FOR ENCODE (2004-03-26 kate)

    # create alignments with blat, using 3K split of hg16, 
    #   doc'ed in /cluster/data/hg16/bed/bedOver/liftOver.doc
    ssh kk
    cd /cluster/data/hg13
    mkdir -p bed/blat.hg16.2004-03-29
    ln -s bed/blat.hg16.2004-03-29 bed/blat.hg16
    cd bed/blat.hg16
    mkdir raw psl run
    cd /cluster/data/hg13/bed/blat.hg16/run
echo '#LOOP' > gsub
echo 'blat $(path1) $(path2) {check out line+ ../raw/$(root1)_$(root2).psl} -tileSize=11 -ooc=/cluster/bluearc/hg/h/11.ooc -minScore=100 -minIdentity=98 -fastMap' >> gsub
echo '#ENDLOOP' >> gsub
    # query
    ls -1S /scratch/hg/gs.17/build34/liftOver/split/*.fa > new.lst
    # target
    ls -1S /cluster/data/hg13/mixedNib/*.nib > old.lst
    gensub2 old.lst new.lst gsub spec
    para create spec
    para try 
    para push

    ssh eieio
    cd /cluster/data/hg13/bed/blat.hg16
    cd raw
cat > liftup.csh << 'EOF'
    set liftDir = /cluster/bluearc/hg/gs.17/build34/liftOver/lift
    foreach i (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y M)
        echo chr$i
        liftUp -pslQ ../psl/chr$i.psl $liftDir/chr$i.lft warn chr*_chr$i.psl
        echo done $i
    end
'EOF'
    csh liftup.csh >&! liftup.log &

    cd ..

    # create alignment chains
    ssh kk
    cd /cluster/data/hg13/bed/blat.hg16
    mkdir chainRun chainRaw chain
    cd chainRun
    echo '#LOOP' > gsub
    echo 'axtChain -psl $(path1) /cluster/data/hg13/mixedNib /scratch/hg/gs.17/build34/bothMaskedNibs {check out line+ ../chainRaw/$(root1).chain}' >> gsub
    echo '#ENDLOOP' >> gsub
    ls -1S ../psl/*.psl > in.lst
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push

    ssh eieio
    cd /cluster/data/hg13/bed/blat.hg16
    chainMergeSort chainRaw/*.chain | chainSplit chain stdin
    mkdir net
    cd chain
    foreach i (*.chain)
        chainNet $i /cluster/data/hg13/chrom.sizes \
                /cluster/data/hg16/chrom.sizes ../net/$i:r.net /dev/null
        echo done $i
    end
    mkdir ../over
    foreach i (*.chain)
        netChainSubset ../net/$i:r.net $i ../over/$i
        echo done $i
    end
    cat ../over/*.chain > ../over.chain
    mkdir -p /cluster/data/hg13/bed/bedOver
    cp ../over.chain /cluster/data/hg13/bed/bedOver/hg13ToHg16.over.chain

    # save to download area
    cd /usr/local/apache/htdocs/goldenPath/hg13
    mkdir -p liftOver
    cp /cluster/data/hg13/bed/bedOver/hg13ToHg16.over.chain liftOver
    gzip liftOver/hg13ToHg16.over.chain

    # load into database
    mkdir -p /gbdb/hg13/liftOver
    ln -s /cluster/data/hg13/bed/bedOver/hg13ToHg16.over.chain \
        /gbdb/hg13/liftOver
    hgAddLiftOverChain hg13 hg16


# LIFTOVER CHAIN TO HG15

    cp /cluster/data/hg15/bed/bedOver/over.chain \
        /cluster/data/hg13/bed/bedOver/hg13ToHg15.over.chain

    # save to download area
    cd /usr/local/apache/htdocs/goldenPath/hg13
    mkdir -p liftOver
    cp /cluster/data/hg13/bed/bedOver/hg13ToHg15.over.chain liftOver
    gzip liftOver/hg13ToHg15.over.chain

    # load into database
    mkdir -p /gbdb/hg13/liftOver
    ln -s /cluster/data/hg13/bed/bedOver/hg13ToHg15.over.chain \
        /gbdb/hg13/liftOver
    hgAddLiftOverChain hg13 hg15


