#
This file describes how we made the browser database on the mouse
genome, February 2002 build.

BREAK UP THE MOUSE SEQUENCE INTO 2 MB CHUNKS AT NON_BRIDGED CONTIGS (done)

o - This version of the mouse sequence data is in /cluster/store2/mm.2002.02/mm2/assembly
o - cd into your CVS source tree under kent/src/hg/splitFaIntoContigs
    - Type make
    - Run 
	gunzip -c /cluster/store2/mm.2002.02/mm2/assembly/*.fasta.gz | splitFaIntoContigs /cluster/store2/mm.2002.02/mm2/assembly/*.agp stdin /cluster/store2/mm.2002.02/mm2 2000000

    - This will split the mouse sequence into approx. 2 Mbase supercontigs between non-bridged clone contigs and drop the resulting dir structure in /cluster/store2/mm.2002.02/mm2.
    - The resulting dir structure will include 1 dir for each chromosome, each of which has a set of subdirectories, one subdir per supercontig.    


COPY THE MOUSE SEQUENCE DATA TO THE CLUSTER (done)
o - ssh kkstore
o - Copy the rna data to the cluster if it isn't there already:
       mkdir /scratch/hg/mrna.128
       cp -r /cluster/store1/mrna.128/org /scratch/hg/mrna.128
o - Copy the mouse sequence supercontigs to the cluster
       mkdir /scratch/hg/mm2/
       mkdir /scratch/hg/mm2/contigs
       cp /cluster/store2/mm.2002.02/mm2/*/chr*/chr*.fa /scratch/hg/mm2/contigs
o - Distribute this data to the local nodes:
       sudo /cluster/install/utilities/updateLocal

REPEAT MASKING (DONE 07/30/02)
   Split contigs, run RepeatMasker, lift results
   Notes: 
   * If there is a new version of RepeatMasker, build it and ask the admins 
     to binrsync it (kkstore:/scratch/hg/RepeatMasker/*).
   * Contigs (*/chr*_*/chr*_*.fa) are split into 500kb chunks to make 
     RepeatMasker runs manageable on the cluster ==> results need lifting.
   * For the NCBI assembly we repeat mask on the sensitive mode setting
     (RepeatMasker -m -s)

        #- Split contigs into 500kb chunks:
        cd ~/mm2
        foreach d ( */chr*_?{,?} )
          cd $d
          set contig = $d:t
          faSplit size $contig.fa 500000 ${contig}_ -lift=$contig.lft \
            -maxN=500000
          cd ../..
        end

        #- Make the run directory and job list:
        cd ~/mm2
        mkdir RMRun
        rm -f RMRun/RMJobs
        touch RMRun/RMJobs
        foreach d ( ?{,?}/chr*_?{,?} )
          foreach f ( $d/chr*_*_*.fa )
            set f = $f:t
            echo /cluster/bin/scripts/RMMouse \
                 /cluster/store2/mm.2002.02/mm2/$d $f \
               '{'check out line+ /cluster/store2/mm.2002.02/mm2/$d/$f.out'}' \
              >> RMRun/RMJobs
          end
        end

        #- Do the run
        ssh kk
        cd ~/mm2/RMRun
        para create RMJobs
        para try, para check, para check, para push, para check,...

        #- Lift up the split-contig .out's to contig-level .out's
        cd ~/mm2
        foreach d ( ?{,?}/chr*_?{,?} )
          cd $d
          set contig = $d:t
          liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null
          cd ../..
        end

        #- Lift up the contig-level .out's to chr-level
        cd ~/mm2
        ./jkStuff/liftOut5.sh

        #- Load the .out files into the database with:
        ssh hgwdev
        cd ~/mm2
        hgLoadOut mm2 ?/*.fa.out ??/*.fa.out

        ssh kkstore
        cd ~/mm2
        #- Soft-mask (lower-case) the contig and chr .fa's
        tcsh jkStuff/makeFaMasked.sh
        #- Make hard-masked .fa.masked files as well:
        tcsh jkStuff/makeHardMasked.sh
        #- Rebuild the nib, mixedNib, maskedNib files:
        tcsh jkStuff/makeNib.sh
        #- Rebuild the .zip files
        tcsh jkStuff/zipAll.sh
        #- copy the contig .fa's to the appropriate place on /scratch
        cp -p ?{,?}/chr*/chr?{,?}_?{,?}.fa /scratch/hg/mm2/contigs.0730

        #- Copy the .zip files to hgwdev:/usr/local/apache/...
        ssh hgwdev
        cd ~/mm2
        tcsh jkStuff/cpToWeb.sh


EXTRACT LINEAGE-SPECIFIC REPEATS (ARIAN SMIT''s scripts) (DONE 11/4/02)

    ssh kkstore
    mkdir -p ~/mm2/bed/linSpecRep
    cd ~/mm2/bed/linSpecRep
    foreach f (~/mm2/*/*.out)
        ln -sf $f .
    end
    /cluster/bin/scripts/rodentSpecificRepeats.pl *.out
    /cluster/bin/scripts/perl-rename 's/(\.fa|\.nib)//' *.out.*spec
    /cluster/bin/scripts/perl-rename 's/\.(rod|prim)spec/.spec/' *.out.*spec
    rm *.out
    rm -rf /scratch/hg/mm2/linSpecRep
    cd ..
    cp -R linSpecRep /scratch/hg/mm2
    # Ask cluster-admin@cse.ucsc.edu to binrsync /scratch/hg to clusters


CREATING DATABASE AND STORING mRNA/EST SEQUENCE AND AUXILIARY INFO 

o - Create the database.
     - ssh hgwdev
     - Enter mysql via:
           mysql -u hgcat -pbigsecret
     - At mysql prompt type:
	create database mm2;
	quit
     - make a semi-permanent read-only alias:
        alias mm2 "mysql -u hguser -phguserstuff -A mm2"
o - Use df to make sure there is at least 5 gig free on hgwdev:/usr/local/mysql
o - Store the mRNA (non-alignment) info in database.
    (Matt - please update this section... )


STORING O+O SEQUENCE AND ASSEMBLY INFORMATION  (done)

Create packed chromosome sequence files 
     ssh kkstore
     cd ~/mm
     tcsh jkStuff/makeNib.sh

Load chromosome sequence info into database and save size info.
     ssh hgwdev
     hgsql mm2  < ~/src/hg/lib/chromInfo.sql
     cd ~/mm
     hgNibSeq -preMadeNib mm2 /cluster/store2/mm.2002.02/mm2/nib ?/chr*.fa ??/chr*.fa 
     mysql -u hguser -phguserstuff -N -e "select chrom,size from chromInfo" mm2 > chrom.sizes

Store o+o info in database.
     cd /cluster/store2/mm.2002.02/mm2
     hgGoldGapGl mm2 /cluster/store2/mm.2002.02 mm2 -noGl

Make and load GC percent table
     ssh hgwdev
     cd /cluster/store2/mm.2002.02/mm2/bed
     mkdir gcPercent
     cd gcPercent
     mysql -A -u hgcat -pbigsecret mm2  < ~/src/hg/lib/gcPercent.sql
     hgGcPercent mm2 ../../nib



MAKING AND STORING mRNA AND EST ALIGNMENTS  (done)

o - Load up the local disks of the cluster with refSeq.fa, mrna.fa and est.fa
    from /cluster/store1/mrna.127  into /var/tmp/hg/h/mrna

o - Use BLAT to generate refSeq, mRNA and EST alignments as so:
      Make sure that /scratch/hg/mm2/contigs is loaded
      with chr*_*.fa and pushed to the cluster nodes.  The following
      cshell script needs updating.

	  cd ~/mm/bed
	  foreach i (refSeq mrna est)
	      mkdir $i
	      cd $i
	      echo /scratch/hg/gs.11/build28/contigs | wordLine stdin > genome.lst
	      ls -1 /scratch/hg/mrna.127/$i.fa > mrna.lst
	      mkdir psl
	      gensub2 genome.lst mrna.lst gsub spec
	      jabba make hut spec
	      jabba push hut
	  end 

    check on progress with jabba check hut in mrna, est, and refSeq
    directories.

      
o - Process refSeq mRNA and EST alignments into near best in genome.
      cd ~/mm/bed

      cd refSeq
      pslSort dirs raw.psl /cluster/fast1/temp psl
      pslReps -minCover=0.2 -sizeMatters -minAli=0.98 -nearTop=0.002 raw.psl contig.psl /dev/null
      liftUp -nohead all_refSeq.psl ../../jkStuff/liftAll.lft warn contig.psl
      pslSortAcc nohead chrom /cluster/fast1/temp all_refSeq.psl
      cd ..

      cd mrna
      pslSort dirs raw.psl /cluster/fast1/temp psl
      pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl /dev/null
      liftUp -nohead all_mrna.psl ../../jkStuff/liftAll.lft warn contig.psl
      pslSortAcc nohead chrom /cluster/fast1/temp all_mrna.psl
      cd ..

      cd est
      pslSort dirs raw.psl /cluster/fast1/temp psl
      pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl /dev/null
      liftUp -nohead all_est.psl ../../jkStuff/liftAll.lft warn contig.psl
      pslSortAcc nohead chrom /cluster/fast1/temp all_est.psl
      cd ..

o - Load mRNA alignments into database.
      ssh hgwdev
      cd /cluster/store2/mm.2002.02/mm2/bed/mrna/chrom
      foreach i (*.psl)
          mv $i $i:r_mrna.psl
      end
      hgLoadPsl mm2 *.psl
      cd ..
      hgLoadPsl mm2 all_mrna.psl -nobin

o - Load EST alignments into database.
      ssh hgwdev
      cd /cluster/store2/mm.2002.02/mm2/bed/est/chrom
      foreach i (*.psl)
            mv $i $i:r_est.psl
      end
      hgLoadPsl mm2 *.psl
      cd ..
      hgLoadPsl mm2 all_est.psl -nobin

o - Create subset of ESTs with introns and load into database.
      - ssh kkstore
      cd ~/mm
      tcsh jkStuff/makeIntronEst.sh
      - ssh hgwdev
      cd ~/mm/bed/est/intronEst
      hgLoadPsl mm2 *.psl

o - Load refSeq alignments into database
      ssh hgwdev
      cd ~/mm/bed/refSeq
      pslCat -dir chrom > refSeqAli.psl
      hgLoadPsl mm2 -tNameIx refSeqAli.psl


PRODUCING ESTORIENTINFO TABLE

This table is needed for proper orientation of ESTs in the
browser.  Many will appear on the wrong strand without it.
This involves a cluster run.  First load the EST psl files
as so:
     ssh kkstore
     cd ~/mm/bed/est
     pslSortAcc nohead contig /cluster/fast1/temp contig.psl
     mkdir /scratch/hg/mm2/est
     cp -r contig /scratch/hg/mm2/est
     sudo /cluster/install/utilities/updateLocal
Wait for these to finish.
     cd ..
     mkdir estOrientInfo
     cd estOrientInfo
     mkdir ei
     ls -1S /scratch/hg/mm2/est/contig > psl.lst
     cp ~/lastMm/bed/estOrientInfo/gsub .
Update gsub to refer to mouse contig sequence currently on
/scratch, and mouse ESTs on /scratch.
     gensub2 psl.lst single gsub spec
     para create spec
Then run the  job on the cluster
     ssh kk
     cd ~/mm/bed/estOrientInfo
     para try
     sleep 60
     para check
If things look good
     para push
Wait for this to finish then
     liftUp estOrientInfo.bed ../../jkStuff/liftAll.lft warn ei/*.tab
Load them into database as so:
     ssh hgwdev
     cd ~/mm/bed/estOrientInfo
     hgLoadBed mm2 estOrientInfo estOrientInfo.bed -sqlTable=/cluster/home/kent/src/hg/lib/estOrientInfo.sql
     
CREATE RNACLUSTER TABLE (done)
 Make sure that refSeqAli and estOrientInfo tables are made already
 (see above).

   ssh hgwdev
   cd ~/mm/bed
   mkdir rnaCluster
   cd rnaCluster
   mkdir rna est
   foreach i (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Un)
       clusterRna mm2 rna/chr$i.bed est/chr$i.bed -chrom=chr$i
       echo done $i
   end
   hgLoadBed mm2 rnaCluster est/*.bed


PRODUCING KNOWN GENES (done)

o - Download everything from ftp://ncbi.nlm.nih.gov/refseq/M_musculus/mRNA_Prot/
    into ~/mm/bed/refSeq
o - Unpack this into fa files and get extra info with:
       cd ~/mm/bed/refSeq
       gunzip mouse.faa.gz
       gunzip mouse.gbff.gz
       gbToFaRa ~/hg/h/allRna.fil refSeq.fa refSeq.ra refSeq.ta mouse.gbff
o - Get extra info from NCBI and produce refGene table as so:
       wget ftp://ncbi.nlm.nih.gov/refseq/LocusLink/loc2ref 
       wget ftp://ncbi.nlm.nih.gov/refseq/LocusLink/mim2loc
o - Produce refGene, refPep, refMrna, and refLink tables as so:
       hgRefSeqMrna mm2 refSeq.fa refSeq.ra all_refSeq.psl loc2ref mouse.faa mim2loc
o - Add RefSeq status info (done 6/19/02)
    hgRefSeqStatus mm2 loc2ref

REFFLAT

o - create refFlat as a precomputed join of refLink and refGene:
      echo 'CREATE TABLE refFlat (KEY geneName (geneName), KEY name (name), KEY chrom (chrom)) SELECT refLink.name as geneName, refGene.* FROM refLink,refGene WHERE refLink.mrnaAcc = refGene.name' | hgsql mm2

SIMPLE REPEAT TRACK (done)

o - Create cluster parasol job like so:
        ssh kk
	cd ~/mm/bed
	mkdir simpleRepeat
	cd simpleRepeat
	cp ~/lastOo/bed/simpleRepeat/gsub .
	mkdir trf
	ls -1 /scratch/hg/mm2/contigs/*.fa > genome.lst
	gensub2 genome.lst single gsub spec
	para make spec
        para push 
     When job is done do:
        liftUp simpleRepeat.bed ~/mm/jkStuff/liftAll.lft warn trf/*.bed

o - Load this into the database as so
        ssh hgwdev
	cd ~/mm/bed/simpleRepeat
	hgLoadBed mm2 simpleRepeat simpleRepeat.bed -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql


LOADING MOUSE MM2 HUMAN BLASTZ ALIGNMENTS FROM PENN STATE: (IN PROGRESS: generated 12/7/02, not loaded into db)

    # Translate Penn State .lav files into sorted axt:
    ssh kkstore
    set base="/cluster/store2/mm.2002.02/mm2/bed/blastz.gs14.2002-12-6-ASH"
    set seq1_dir="/cluster/store2/mm.2002.02/mm2/trfMixedNib/"
    set seq2_dir="/cluster/store4/gs.14/build31/mixedNib/"
    set tbl="blastzHg13"
    cd $base
    mkdir -p axtChrom
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=$base/axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      cat `ls -1 *.lav | sort -g` \
        | lavToAxt stdin $seq1_dir $seq2_dir stdout \
        | axtSort stdin $out
      popd
    end

    # Translate the sorted axt files into psl:
    cd $base
    mkdir -p pslChrom
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end

    # Load tables
    ssh hgwdev
    set base="/cluster/store2/mm.2002.02/mm2/bed/blastz.gs14.2002-12-6-ASH"
    set tbl="blastzHg13"
    cd $base/pslChrom
    hgLoadPsl mm2 chr*_${tbl}.psl

MAKING THE BLASTZBESTHUMAN TRACK FROM PENN STATE MM2 AXT FILES (IN PROGRESS: generated 12/7/02, not loaded into db)

    # Consolidate AXT files to chrom level, sort, pick best, make psl.
    ssh kkstore
    set base="/cluster/store2/mm.2002.02/mm2/bed/blastz.gs14.2002-12-6-ASH"
    set seq1_dir="/cluster/store2/mm.2002.02/mm2/trfMixedNib/"
    set seq2_dir="/cluster/store4/gs.14/build31/mixedNib/"
    set tbl="blastzBestHuman"
    cd $base
    mkdir -p axtBest pslBest
    foreach chrdir (lav/chr*)
      set chr=$chrdir:t
      echo axtBesting $chr
      axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300
      echo translating axtBest to psl for $chr
      axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
    end
    # If a chromosome has so many alignments that axtBest runs out of mem,
    # run axtBest in 2 passes to reduce size of the input to final axtBest:
    foreach chrdir (lav/chr7)
      set chr=$chrdir:t
      echo two-pass axtBesting $chr
      foreach d ($chrdir/*.lav)
        set smallout=$d.axt
        lavToAxt $d $seq1_dir $seq2_dir stdout \
        | axtSort stdin $smallout
      end
      foreach a ($chrdir/*.axt)
        axtBest $a $chr $a:r.axtBest
      end
      cat `ls -1 $chrdir/*.axtBest | sort -g` \
        > $chrdir/$chr.axtBestPieces
      axtBest $chrdir/$chr.axtBestPieces $chr axtBest/$chr.axt
      axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
    end

    # Load tables
     ssh hgwdev
     set base="/cluster/store2/mm.2002.02/mm2/bed/blastz.gs14.2002-12-6-ASH"
     set tbl="blastzBestHuman"
     cd $base/pslBest
     hgLoadPsl mm2 chr*_${tbl}.psl

    # Make /gbdb links and add them to the axtInfo table:
     mkdir -p /gbdb/mm2/axtBestHg13
     cd /gbdb/mm2/axtBestHg13
     foreach f ($base/axtBest/chr*.axt)
       ln -s $f .
     end
     cd $base/axtBest
     rm -f axtInfoInserts.sql
     touch axtInfoInserts.sql
     foreach f (/gbdb/mm2/axtBestHg13/chr*.axt)
       set chr=$f:t:r
       echo "INSERT INTO axtInfo VALUES ('hg13','Blastz Best in Genome','$chr','$f');" \
         >> axtInfoInserts.sql
     end
     hgsql mm2 < ~/kent/src/hg/lib/axtInfo.sql
     hgsql mm2 < axtInfoInserts.sql

MAKING THE HUMAN AXTTIGHT FROM AXTBEST (IN PROGRESS: generated 12/7/02, not loaded into db)
    # After creating axtBest alignments above, use subsetAxt to get axtTight:
    ssh kkstore
    cd ~/mm2/bed/blastz.gs14.2002-12-6-ASH/axtBest
    mkdir -p ../axtTight
    foreach i (*.axt)
      subsetAxt  $i ../axtTight/$i \
        ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    end
    # translate to psl
    cd ../axtTight
    mkdir -p ../pslTight
    foreach i (*.axt)
      set c = $i:r
      axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightHuman.psl
    end
    # Load tables into database
    ssh hgwdev
    cd ~/mm2/bed/blastz.gs14.2002-12-6-ASH/pslTight
    hgLoadPsl mm2 chr*_blastzTightHuman.psl


LOADING MOUSE MM2 RAT BLASTZ ALIGNMENTS FROM PENN STATE: (DONE 1/9/03)

    # Translate Penn State .lav files into sorted axt:
    ssh kkstore
    set base="/cluster/store2/mm.2002.02/mm2/bed/blastz.rn1.2003-01-09-ASH"
    set seq1_dir="/cluster/store2/mm.2002.02/mm2/trfMixedNib/"
    set seq2_dir="/cluster/store4/rn1/mixedNib/"
    set tbl="blastzRn1"
    cd $base
    mkdir -p axtChrom
    # Some chromosomes have so many alignments that axtSort runs out of mem,
    # so generate a sorted .axt for each small .lav chunk, then cat a sorted 
    # list of chunk .axt files together to make the chrom .axt:
    foreach c (lav/chr*)
      pushd $c
      set chr=$c:t
      set out=$base/axtChrom/$chr.axt
      echo two-pass lavToAxting $chr
      foreach d (*.lav)
        set smallout=$d.axt
        lavToAxt $d $seq1_dir $seq2_dir stdout \
        | axtSort stdin $smallout
      end
      cat `ls -1 *.lav.axt | sort -g` \
        > $out
      popd
    end

    # Mouse-rat alignments are quite large, and the unfiltered .axt's are 
    # not used as often (or by the browser) as the axtBest .axt's... so 
    # compress them to save disk space:
    cd $base/axtChrom
    gzip chr*.axt

    # Translate the sorted axt files into psl:
    cd $base
    mkdir -p pslChrom
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r:r
       axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end

    # Load tables
    ssh hgwdev
    set base="/cluster/store2/mm.2002.02/mm2/bed/blastz.rn1.2003-01-09-ASH"
    set tbl="blastzRn1"
    cd $base/pslChrom
    hgLoadPsl mm2 chr*_${tbl}.psl

MAKING THE BLASTZBESTRAT TRACK FROM PENN STATE MM2 AXT FILES (DONE 1/9/03)

    # Consolidate AXT files to chrom level, sort, pick best, make psl.
    ssh kkstore
    set base="/cluster/store2/mm.2002.02/mm2/bed/blastz.rn1.2003-01-09-ASH"
    set seq1_dir="/cluster/store2/mm.2002.02/mm2/trfMixedNib/"
    set seq2_dir="/cluster/store4/rn1/mixedNib/"
    set tbl="blastzBestRat"
    cd $base
    mkdir -p axtBest pslBest
    # Again, run in 2 passes (axtBest on small chunks, then axtBest on 
    # those results to resolve overlaps) to avoid running axtBest out of mem.
    foreach chrdir (lav/chr*)
      set chr=$chrdir:t
      echo two-pass axtBesting $chr
      foreach a ($chrdir/*.lav.axt)
        axtBest $a $chr $a:r.axtBest
      end
      cat `ls -1 $chrdir/*.axtBest | sort -g` | \
        axtBest stdin $chr axtBest/$chr.axt
      axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
    end

    # Now clean up chunk .axt files to save disk space:
    cd $base
    rm lav/chr*/*.lav.axt*

    # Load tables
     ssh hgwdev
     set base="/cluster/store2/mm.2002.02/mm2/bed/blastz.rn1.2003-01-09-ASH"
     set tbl="blastzBestRat"
     cd $base/pslBest
     hgLoadPsl mm2 chr*_${tbl}.psl

    # Make /gbdb links and add them to the axtInfo table:
     mkdir -p /gbdb/mm2/axtBestRn1
     cd /gbdb/mm2/axtBestRn1
     rm -f *
     foreach f ($base/axtBest/chr*.axt)
       ln -s $f .
     end
     cd $base/axtBest
     rm -f axtInfoInserts.sql
     touch axtInfoInserts.sql
     foreach f (/gbdb/mm2/axtBestRn1/chr*.axt)
       set chr=$f:t:r
       echo "INSERT INTO axtInfo VALUES ('rn1','Blastz Best in Genome','$chr','$f');" \
         >> axtInfoInserts.sql
     end
     hgsql mm2 < ~/kent/src/hg/lib/axtInfo.sql
     hgsql mm2 < axtInfoInserts.sql

MAKING THE RAT AXTTIGHT FROM AXTBEST (DONE 1/9/03)
    # After creating axtBest alignments above, use subsetAxt to get axtTight:
    ssh kkstore
    cd ~/mm2/bed/blastz.rn1.2003-01-09-ASH/axtBest
    mkdir -p ../axtTight
    foreach i (*.axt)
      subsetAxt  $i ../axtTight/$i \
        ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    end
    # translate to psl
    cd ../axtTight
    mkdir -p ../pslTight
    foreach i (*.axt)
      set c = $i:r
      axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightRat.psl
    end
    # Load tables into database
    ssh hgwdev
    cd ~/mm2/bed/blastz.rn1.2003-01-09-ASH/pslTight
    hgLoadPsl mm2 chr*_blastzTightRat.psl


PRODUCING GENSCAN PREDICTIONS (done)
    
o - Produce contig genscan.gtf genscan.pep and genscanExtra.bed files like so:

	First make sure you have appropriate set up, permissions, etc.
	and you have tried using Parasol to submit and finished a set 
	of jobs successfully.
	
	Load up the cluster with hard-masked contigs in
	   /scratch/hg/mm2/mContigs
     	
	Log into kkr1u00 (not kk!).  kkr1u00 is the driver node for the small
	cluster (kkr2u00 - kkr8u00).  (genscan has problems running on the
	big cluster, due to limited memory and swap space on each
	processing node.)
	     	cd ~/mm
     		cd bed/genscan
	Make 3 subdirectories for genscan to put their output files in
		mkdir gtf pep subopt
	Generate a list file, genome.list, of all the contigs
		ls -1S /cluster/store2/mm.2002.02/mm2/mContigs/* >genome.list
	Edit genome.list to remove jobs on all 20 files of chr??_1.fa.masked.
	Those files have pure Ns due to heterochromatin (unsequencable
	stuff) and will cause genscan to run forever.
	
	Create template file, gsub, for gensub2.  For example (3 lines file):
		#LOOP
		/cluster/home/fanhsu/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000
		#ENDLOOP
	Create a file containing a single line.
		echo single > single
	Generate job list file, jobList, for Parasol
		gensub2 genome.list single gsub jobList
	
	Edit jobList to find the line containing "chr12_5.fa.masked"
	and change "-window=2400000" into "-window=1200000", because
	genscan has "not enough memory" problem with this particular
	file.
	
	First issue the following Parasol command:
		para create jobList
	Run the following command, which will try first 10 jobs from jobList
		para try
	Check if these 10 jobs run OK by
		para check
	If they have problems, debug and fix your program, template file, 
	commands, etc. and try again.  If they are OK, then issue the following 
	command, which will ask Parasol to start all the remaining jobs (around
	~252 jobs).
		para push
	Issue either one of the following two commands to check the 
	status of the cluster and your jobs, until they are done.
		parasol status
		para check
	If any job fails to complete, study the problem and ask Jim to help
	if necessary.

o - Convert these to chromosome level files as so:
     cd ~/mm
     cd bed/genscan
     liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
     liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
     cat pep/*.pep > genscan.pep

o - Load into the database as so:
     ssh hgwdev
     cd ~/mm/bed/genscan
     ldHgGene mm2 genscan genscan.gtf
     hgPepPred mm2 generic genscanPep genscan.pep
     hgLoadBed mm2 genscanSubopt genscanSubopt.bed


TWINSCAN GENE PREDICTIONS (done 6/10/02; reloaded 12/3/02)

    mkdir -p ~/mm2/bed/twinscan
    cd ~/mm2/bed/twinscan
    mv Gtf.tgz Gtf.020610.tgz
    mv Ptx.tgz Ptx.020610.tgz
    rm chr*.gtf chr*.ptx chr*.fa *.tab
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X)
      wget http://genes.cs.wustl.edu/mouse/12-3-02/gtf/chr$c.gtf
      wget http://genes.cs.wustl.edu/mouse/12-3-02/ptx/chr$c.ptx
    end
    ldHgGene mm2 twinscan chr*.gtf -exon=CDS
    - pare down to id:
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X)
      perl -wpe 's/^\>.*\s+source_id\s*\=\s*(\S+).*$/\>$1/;' < \
        chr$c.ptx > chr$c-fixed.fa
    end
    hgPepPred mm2 generic twinscanPep chr*-fixed.fa

NCBI GENE MODELS (done 05/31/02)

    mkdir -p ~/mm2/bed/ncbiGenes
    cd ~/mm2/bed/ncbiGenes
    wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/MGSCv3_Release1/maps/chr_genes.gtf.gz
    wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/MGSCv3_Release1/protein/protein.fa.gz
    gunzip chr_genes.gtf.gz
    gunzip protein.fa.gz
    - Process the .gtf and .fa together to join IDs
    ../../jkStuff/mungeNCBIids chr_genes.gtf protein.fa |& uniq
    ldHgGene mm2 ncbiGenes chr_genes-fixed.gtf
    hgPepPred mm2 generic ncbiPep protein-fixed.fa

NCBI GENOMESCAN MODELS (done 05/31/02)

    mkdir -p ~/mm2/bed/genomeScan
    cd ~/mm2/bed/genomeScan
    wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/MGSCv3_Release1/maps/chr_GenomeScan.gtf.gz
    - Remove the ".1" at the end of transcript_id's:
    gunzip -c chr_GenomeScan.gtf.gz | \
      perl -wpe 's/transcript_id "([^\"]+)\.1"/transcript_id "$1"/' > \
      chr_GenomeScan-fixed.gtf
    ldHgGene mm2 genomeScan chr_GenomeScan-fixed.gtf
    wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/MGSCv3_Release1/protein/GS_prot.fsa.gz
    hgPepPred mm2 generic genomeScanPep GS_prot.fsa

PREPARING SEQUENCE FOR CROSS SPECIES ALIGNMENTS (DONE 8/02/02)

Make sure that the contig files are lower-case repeat masked
then do
   ssh kkstore
   cd ~/mm
   source jkStuff/makeTrfFa.sh
Then make sure there is enough space available on /scratch
and do
   cp -Rp ~/mm/trfFa /scratch/hg/mm2/trfFa.0802
   # sudo /cluster/install/utilities/updateLocal


PREPARING POST-TRF CHROM-LEVEL MIXED NIBs for blastz (DONE 11/6/02)

    # lift trfMask output to chrom-level... this is a pain because all 
    # trf output was put in the same dir.  maybe next time around, we 
    # can preserve chrom dir structure...
    ssh kkstore
    cd ~/mm2
    foreach c (?{,?})
     if (-e $c/lift/ordered.lst) then
       set ntlist = ()
       foreach n (`cat $c/lift/ordered.lst`)
         set ntlist = ($ntlist bed/simpleRepeat/trf/$n.bed)
       end
       liftUp $c/chr$c.trf.bed jkStuff/liftAll.lft warn $ntlist
     endif
    end
    # make trf-masked chrom-level .fa
    foreach c (?{,?})
      cd $c
      if (-e chr$c.trf.bed) then
        echo masking $c...
        cp chr$c.fa chr$c.trf.fa
        maskOutFa -softAdd chr$c.trf.fa chr$c.trf.bed chr$c.trf.fa
      endif
      cd ..
    end
    # make nib
    mkdir trfMixedNib
    foreach c (?{,?})
      if (-e $c/chr$c.trf.fa) then
        faToNib -softMask $c/chr$c.trf.fa trfMixedNib/chr$c.nib
      endif
    end
    rm -rf /scratch/hg/mm2/chromTrfMixedNib
    cp -pR trfMixedNib /scratch/hg/mm2/chromTrfMixedNib


DOING HUMAN/MOUSE ALIGNMENTS (todo)

o - Download the lower-case-masked assembly and put it in
    kkstore:/cluster/store1/a2ms.
   
o - Download the assembled mouse genome in lower-case
    masked form to /cluster/store1/arachne.3/whole.  
    Execute the script splitAndCopy.csh to chop it
    into roughly 50M pieces in arachne.3/parts
o - Set up the jabba job to do the alignment as so:
       ssh kkstore
       cd /cluster/store2/mm.2002.02/mm2
       mkdir blatMouse.phusion
       cd blatMouse.phusion
       ls -1S /scratch/hg/gs.3/build28/contigTrf/* > human.lst
       ls -1 /cluster/store1/arachne.3/parts/* > mouse.lst
    Make a file 'gsub' with the following three lines in it
#LOOP
/cluster/home/kent/bin/i386/blat -q=dnax -t=dnax {check in line+ $(path2)} {check in line+ $(path1)}  {check out line+ psl/$(root2)_$(root1).psl} -minScore=20 -minIdentity=20 -tileSize=4 -minMatch=2 -oneOff=0 -ooc={check in exists /scratch/hg/h/4.pooc} -qMask=lower -mask=lower
#ENDLOOP
    Process this into a jabba file and launch the first set
    of jobs (10,000 out of 70,000) as so:
        gensub2 mouse.lst human.lst gsub spec
	jabba make hut spec
	jabba push hut
    Do a 'jabba check hut' after about 20 minutes and make sure
    everything is right.  After that make a little script that
    does a "jabba push hut" followed by a "sleep 30" about 50
    times.  Interrupt script when you see jabba push say it's
    not pushing anything.

o - Sort alignments as so 
       ssh kkstore
       cd /cluster/store2/mm.2002.02/mm2/blatMouse.phusion
       pslCat -dir -check psl | liftUp -type=.psl stdout ../liftAll.lft warn stdin | pslSortAcc nohead chrom /cluster/store2/temp stdin
o - Get rid of big pile-ups due to contamination as so:
       cd chrom
       foreach i (*.psl)
           echo $i
           mv $i xxx
           pslUnpile -maxPile=600 xxx $i
       rm xxx
       end
o - Remove long redundant bits from read names by making a file
    called subs.in with the following lines:
        gnl|ti^ti
        contig_^tig_
    and running the commands
        cd ~/mouse/vsOo33/blatMouse.phusion/chrom
	subs -e -c ^ *.psl > /dev/null
o - Copy over to network where database is:
        ssh kks00
	cd ~/mm/bed
	mkdir blatMouse
	mkdir blatMouse/ph.chrom600
	cd !$
        cp /cluster/store2/mm.2002.02/mm2/blatMouse.phusion/chrom/*.psl .
o - Rename to correspond with tables as so and load into database:
       ssh hgwdev
       cd ~/mm/bed/blatMouse/ph.chrom600
       foreach i (*.psl)
	   set r = $i:r
           mv $i ${r}_blatMouse.psl
       end
       hgLoadPsl mm2 *.psl
o - load sequence into database as so:
	ssh kks00
	faSplit about /projects/hg3/mouse/arachne.3/whole/Unplaced.mfa 1200000000 /projects/hg3/mouse/arachne.3/whole/unplaced
	ssh hgwdev
	hgLoadRna addSeq '-abbr=gnl|' mm2 /projects/hg3/mouse/arachne.3/whole/unpla*.fa
	hgLoadRna addSeq '-abbr=con' mm2 /projects/hg3/mouse/arachne.3/whole/SET*.mfa
    This will take quite some time.  Perhaps an hour.

o - Produce 'best in genome' filtered version:
        ssh kks00
	cd ~/mouse/vsOo33
	pslSort dirs blatMouseAll.psl temp blatMouse
	pslReps blatMouseAll.psl bestMouseAll.psl /dev/null -singleHit -minCover=0.3 -minIdentity=0.1
	pslSortAcc nohead bestMouse temp bestMouseAll.psl
	cd bestMouse
        foreach i (*.psl)
	   set r = $i:r
           mv $i ${r}_bestMouse.psl
        end
o - Load best in genome into database as so:
	ssh hgwdev
	cd ~/mouse/vsOo33/bestMouse
        hgLoadPsl mm2 *.psl

PRODUCING CROSS_SPECIES mRNA ALIGNMENTS (done)

Here you align vertebrate mRNAs against the masked genome on the
cluster you set up during the previous step.

Make sure that gbpri, gbmam, gbrod, and gbvert are downloaded from Genbank into
/cluster/store1/genbank.128 and unpacked by organism into /cluster/store1/mrna.128/org. 

Set up cluster run more or less as so:
      ssh kk
      cd ~/mm/bed
      mkdir xenoMrna
      cd xenoMrna
      ls -1S /scratch/hg/mm2/mContigs/* > genome.lst
      ls -1S /scratch/hg/mrna.128/org/*/mrna.fa > allMrna
Then edit allMrna removing the Mus.musculus line,  and writing
the first line into 1.org, the second line into 2.org,  and
so forth.  After the 6th line just leave the rest in 7.org.
Then
      ls -1 *.org > rna.lst
      cp ~/mm/bed/xenoMrna/gsub .
      gensub2 genome.lst rna.lst gsub spec
      para create spec
      para try
      para check
If all looks well do
      para push

Sort xeno mRNA alignments as so:
       ssh kkstore
       cd ~/mm/bed/xenoMrna
       pslSort dirs raw.psl /cluster/store2/temp psl
       pslReps raw.psl cooked.psl /dev/null -minAli=0.25
       liftUp chrom.psl ../../jkStuff/liftAll.lft warn cooked.psl
       pslSortAcc nohead chrom /cluster/store2/temp chrom.psl
       pslCat -dir chrom > xenoMrna.psl
       rm -r chrom raw.psl cooked.psl chrom.psl

Load into database as so:
       ssh hgwdev
       cd ~/mm/bed/xenoMrna
       hgLoadPsl mm2 xenoMrna.psl -tNameIx
Load other RNA into database as so:
       cd /cluster/store1/mrna.128/topOrg
Note - need to describe how topOrg was made.  See topOrg/README...
       foreach i (*/mrna.fa)
	   hgLoadRna add mm2 /cluster/store1/mrna.128/org/$i $i:r.ra -type=$i:r
	   echo done $i
	end


PRODUCING TETRAODON FISH ALIGNMENTS (done)

o - Download sequence from ... and put it on the cluster local disk
    at
       /scratch/hg/fish
o - Do fish/mouse alignments.
       ssh kk
       cd ~/mm/bed
       mkdir blatFish
       cd blatFish
       mkdir psl
       ls -1S /scratch/hg/fish/* > fish.lst
       ls -1S /scratch/hg/mm2/trfFa/* > mouse.lst
       cp ~/lastMm/blatFish/gsub .
       gensub2 mouse.lst fish.lst gsub spec
       para create spec
       para try
     Make sure jobs are going ok with para check.  Then
       para push
     wait about 2 hours and do another
       para push
     do para checks and if necessary para pushes until done
     or use para shove.
o - Sort alignments as so 
       pslCat -dir psl | liftUp -type=.psl stdout ~/mm/jkStuff/liftAll.lft warn stdin | pslSortAcc nohead chrom /cluster/fast1/temp stdin
o - Copy to hgwdev:/scratch.  Rename to correspond with tables as so and 
    load into database:
       ssh hgwdev
       cd ~/mm/bed/blatFish/chrom
       foreach i (*.psl)
	   set r = $i:r
           mv $i ${r}_blatFish.psl
       end
       hgLoadPsl mm2 *.psl
       hgLoadRna addSeq mm2 /cluster/store2/fish/seq15jun2001/*.fa


PRODUCING FUGU FISH ALIGNMENTS (Done 10/21/02 by Matt)

o - Download sequence to /cluster/store3/fuguSeq from ... and put it on the cluster local disk
    at /scratch/hg/fugu on kkstore.
Sequence was downloaded from:
ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/fugu_v3_mask.fasta.Z
ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/fugu_v3_prot.fasta.Z

faSplit sequence ../fugu_v3_mask.fasta 1000 fuguSplit
o - Do fish/mouse alignments.
       ssh kk
       cd ~/mm/bed
       mkdir blatFugu
       cd blatFugu
       mkdir psl
       ls -1S /scratch/hg/fugu/* > fugu.lst
       ls -1S /scratch/hg/mm2/trfFa.0802/* > mouse.lst
       # Run mkdirs.sh
       # Edit gsub to fit the dir structure
       gensub2 mouse.lst fugu.lst gsub spec
       para create spec
       para try
     Make sure jobs are going ok with para check.  Then
       para push
     wait about 2 hours and do another
       para push
       do para checks and if necessary para pushes until done
     or use para shove.
o - Sort alignments as so 
       pslCat -dir psl/* | liftUp -type=.psl stdout ~/mm2/jkStuff/liftAll.lft warn stdin | pslSortAcc nohead chrom /oldscratch stdin
o - Rename to correspond with tables as so and
    load into database:
       ssh hgwdev
       cd ~/mm2/bed/blatFugu/chrom
       foreach i (*.psl)
	   set r = $i:r
           mv $i ${r}_blatFugu.psl
       end
       hgLoadPsl mm2 *.psl
       hgLoadRna addSeq mm2 /cluster/store3/fuguSeq/fugu_v3_mask.fasta


LOAD GENEID GENES (done)
     cd ~/mm/bed
     mkdir geneid
     cd geneid
     mkdir download
     cd download
   Now download *.gtf and *.prot from 
   http://www1.imim.es/genepredictions/M.musculus/mmFeb2002/geneid_v1.1
   Get rid of the extra .N in the transcripts with subs.  
     cd ..
     cp ~/lastMm/bed/geneid/subs .
     subs -e download/*.gtf > /dev/null
     ldHgGene mm2 geneid download/*.gtf -exon=CDS
     hgPepPred mm2 generic geneidPep download/*.prot

SGP GENE PREDICTIONS (DONE 01/29/03)
    mkdir -p ~/mm2/bed/sgp/download
    cd ~/mm2/bed/sgp/download
    foreach f (~/mm2/?{,?}/chr?{,?}{,_random}.fa)
      set chr = $f:t:r
      wget http://genome.imim.es/genepredictions/M.musculus/mmFeb2002/SGP/humangp20021114/$chr.gtf
      wget http://genome.imim.es/genepredictions/M.musculus/mmFeb2002/SGP/humangp20021114/$chr.prot
    end
    # Add missing .1 to protein id's
    foreach f (*.prot)
      perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
    end
    cd ..
    ldHgGene mm2 sgpGene download/*.gtf -exon=CDS
    hgPepPred mm2 generic sgpPep download/*-fixed.prot


TIGR GENE INDEX (REDONE 3/31/03)
  o mkdir -p ~/mm2/bed/tigr    
    cd ~/mm2/bed/tigr  
    wget ftp://ftp.tigr.org/private/NHGI_mgi_jiashu/TGI_track_MouseGenome_Feb2003.tgz
    tar xvzf TGI*.tgz
    foreach f (*cattle*)
      set f1 = `echo $f | sed -e 's/cattle/cow/g'`
      mv $f $f1
    end

    foreach o (mouse cow human pig rat)
      setenv O $o
      foreach f ([Cc]hr*_$o*s)
       tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
      end
    end
    ldHgGene -exon=TC mm2 tigrGeneIndex *.gff


LOAD STS MAP (todo)
     - login to hgwdev
      cd ~/mm/bed
      mm2 < ~/src/hg/lib/stsMap.sql
      mkdir stsMap
      cd stsMap
      bedSort /projects/cc/hg/mapplots/data/tracks/build28/stsMap.bed stsMap.bed
      - Enter database with "mm2" command.
      - At mysql> prompt type in:
          load data local infile 'stsMap.bed' into table stsMap;
      - At mysql> prompt type
          quit

LOAD MGI IDs (done)
      - The Locuslink ID to MGI IDs conversion data file,
        LL2MGI.txt, from Jackson Lab should be found under
	~/mm/bed/refSeq
      - login to hgwdev
      
      cd ~/mm/bed/refSeq
      mm2 < ~/src/hg/lib/mgiID.sql
      - Enter database with "mm2" command.
      - At mysql> prompt type in:
          load data local infile 'LL2MGI.txt' into table MGIid;
      - At mysql> prompt type
          quit

LOAD CHROMOSOME BANDS (todo)
      - login to hgwdev
      cd /cluster/store2/mm.2002.02/mm2/bed
      mkdir cytoBands
      cp /projects/cc/hg/mapplots/data/tracks/build28/cytobands.bed cytoBands
      mm2 < ~/src/hg/lib/cytoBand.sql
      Enter database with "mm2" command.
      - At mysql> prompt type in:
          load data local infile 'cytobands.bed' into table cytoBand;
      - At mysql> prompt type
          quit

LOAD MOUSEREF TRACK (todo)
    First copy in data from kkstore to ~/mm/bed/mouseRef.  
    Then substitute 'genome' for the appropriate chromosome 
    in each of the alignment files.  Finally do:
       hgRefAlign webb mm2 mouseRef *.alignments

LOAD AVID MOUSE TRACK (todo)
      ssh cc98
      cd ~/mm/bed
      mkdir avidMouse
      cd avidMouse
      wget http://pipeline.lbl.gov/tableCS-LBNL.txt
      hgAvidShortBed *.txt avidRepeat.bed avidUnique.bed
      hgLoadBed mm2 avidRepeat avidRepeat.bed
      hgLoadBed mm2 avidUnique avidUnique.bed

LOAD SNPS (Done.  Daryl Thomas August 16, 2002)
      - ssh hgwdev
      - cd ~/mm/bed
      - mkdir snp
      - cd snp
      - Download SNPs from ftp://ftp.ncbi.nlm.nih.gov/pub/sherry/mouse.b27.out.gz
      - Unpack.
        createBed < mouse.b27.out > snpNih.bed
        hgLoadBed mm2 snpNih snpNih.bed

LOAD CPGISLANDS (todo)
     login to hgwdev
       cd /cluster/store2/mm.2002.02/mm2/bed
       mkdir cpgIsland
       cd cpgIsland
     Get cpgisland tarball out of email from Asif (achinwal@watson.wustl.edu)
     and unpack it.
       awk -f filter.awk */ctg*/*.cpg > cpgIsland.bed
       mysql -u hgcat -pBIGSECRET -A mm2 < ~/src/hg/lib/cpgIsland.sql
       mysql -u hgcat -pBIGSECRET -A mm2 
     At mysql> prompt type in:
       load data local infile 'cpgIsland.bed' into table cpgIsland;

LOAD ENSEMBL ESTs (done 05/28/02, reloaded w/new data 08/05/02)
     ln -s /cluster/store2/mm.2002.02/mm2 ~/mm2
     mkdir -p ~/mm2/bed/ensembl
     cd ~/mm2/bed/ensembl
     wget http://www.ebi.ac.uk/~stabenau/mouse-est.gz
     wget http://www.ebi.ac.uk/~stabenau/mouse-est.pep.gz
     gunzip -c mouse-est.gz | \
       perl -w -p -e 's/^(\w)/chr$1/' > mouse-est-fixed.gtf
     ldHgGene mm2 ensEst mouse-est-fixed.gtf
> The id behind '>' is internal and was not in our gtf dump, so
> you have to do some more parsing.
     # pick out the transcript= attribute -- that's the id to use:
     # also remove the first line:
     gunzip -c mouse-est.pep.gz | tail +2 | \
       perl -w -p -e 's/^\>gene_id=.*transcript=(\w+)\s+.*$/\>$1/' > \
       mouse-est-fixed.pep
     hgPepPred mm2 generic ensEstPep mouse-est-fixed.pep

LOAD ENSEMBL GENES (done 05/28/02, reloaded w/new data 08/05/02)
     mkdir -p ~/mm2/bed/ensembl
     cd ~/mm2/bed/ensembl
     wget http://www.ebi.ac.uk/~stabenau/mouse-ensembl.gz
     wget http://www.ebi.ac.uk/~stabenau/mouse-ensembl.pep.gz
     gunzip -c mouse-ensembl.gz | \
       perl -w -p -e 's/^(\w)/chr$1/' > mouse-ensembl-fixed.gtf
     ldHgGene mm2 ensGene mouse-ensembl-fixed.gtf
> mouse-ensembl contains stopcodons, due to some glitches in our
> genebuild. The id behind '>' is internal and was not in our gtf dump, so
> you have to do some more parsing.
# pick out the transcript= attribute -- that's the id to use:
# also remove the first line:
     gunzip -c mouse-ensembl.pep.gz | tail +2 | \
       perl -w -p -e 's/^\>gene_id=.*transcript=(\w+)\s+.*$/\>$1/' > \
       mouse-ensembl-fixed.pep
     hgPepPred mm2 generic ensPep mouse-ensembl-fixed.pep

LOAD ENSEMBL "Merge" TRACKs - SECRET! (done 6/25/02)
     - Use mgsc database, not mm2.  Only MGSC members should be able to 
       access this track, and only by password protection.
     mkdir -p ~/mm2/bed/ensembl
     cd ~/mm2/bed/ensembl
     foreach tier (b c d)
       GET http://www.ebi.ac.uk/~stabenau/tier$tier.gtf.gz > tier$tier.gtf.gz
       GET http://www.ebi.ac.uk/~stabenau/mouse_tier$tier.fa.gz > tier$tier.fa.gz
       gunzip -c tier$tier.gtf.gz | \
         perl -w -p -e 's/^(\w)/chr$1/' > tier$tier-fixed.gtf
       gunzip -c tier$tier.fa.gz | \
         perl -w -p -e 's/^\>.*source_id=(\S+)\s+.*$/\>$1/' > \
         tier$tier-fixed.pep
       set Tier = `echo $tier | tr 'a-z' 'A-Z'`
       ldHgGene mgsc ensMergeTier$Tier tier$tier-fixed.gtf
       hgPepPred mgsc generic ensMergeTier${Tier}Pep tier$tier-fixed.pep
     end
     NOTE: because this track contains ensRiken transcripts, it had to 
     be made secret - see NOTE's below & revision history comments for 
     hgTracks.c 1.277.

LOAD ENSEMBL/RIKEN - SECRET! (05/31/02 - pep todo, reloaded w/new data 08/05/02)
     - Use mgsc database, not mm2.  Only MGSC members should be able to 
       access this track, and only by password protection.
     mkdir -p ~/mm2/bed/ensRiken
     cd ~/mm2/bed/ensRiken
     wget http://www.ebi.ac.uk/~stabenau/mouse-riken.gz
     wget http://www.ebi.ac.uk/~stabenau/???
     gunzip -c mouse-riken.gz | \
       perl -w -p -e 's/^(\w)/chr$1/' > mouse-riken-fixed.gtf
     ldHgGene mgsc ensRiken mouse-riken-fixed.gtf
> The id behind '>' is internal and was not in our gtf dump, so
> you have to do some more parsing.
     # pick out the transcript= attribute -- that's the id to use:
     # also remove the first line:
     gunzip -c mouse-riken.pep.gz | tail +2 | \
       perl -w -p -e 's/^\>gene_id=.*transcript=(\w+)\s+.*$/\>$1/' > \
       mouse-riken-fixed.pep
     hgPepPred mgsc generic ensRikenPep mouse-riken-fixed.pep
     - NOTE: hooks had to be added to hgTracks.c to enable/disable this 
       track together with the main riken track.  see revision history 
       comments for hgTracks.c version 1.262 .
     - NOTE: had to create empty ensRiken table in mm2 in order for 
             hdb.c to believe the track exists.  I used this sql command:
     CREATE TABLE ensRiken (
	name varchar(255) not null,
	chrom varchar(255) not null,
	strand char(1) not null,
	txStart int(10) unsigned not null,
	txEnd int(10) unsigned not null,
	cdsStart int(10) unsigned not null,
	cdsEnd int(10) unsigned not null,
	exonCount int(10) unsigned not null,
	exonStarts longblob not null,
	exonEnds longblob not null
	);

LOAD SANGER22 GENES (todo)
      - cd ~/mm/bed
      - mkdir sanger22
      - cd sanger22
      - not sure where these files were downloaded from
      - grep -v Pseudogene Chr22*.genes.gff | hgSanger22 mm2 stdin Chr22*.cds.gff *.genes.dna *.cds.pep 0 \
          | ldHgGene mm2 sanger22pseudo stdin
         - Note: this creates sanger22extras, but doesn't currently create
           a correct sanger22 table, which are replaced in the next steps
      - sanger22-gff-doctor Chr22.3.1x.cds.gff Chr22.3.1x.genes.gff \
          | ldHgGene mm2 sanger22 stdin
      - sanger22-gff-doctor -pseudogenes Chr22.3.1x.cds.gff Chr22.3.1x.genes.gff \
          | ldHgGene mm2 sanger22pseudo stdin

      - hgPepPred mm2 generic sanger22pep *.pep

LOAD SANGER 20 GENES (todo)
     First download files from James Gilbert's email to ~/mm/bed/sanger20 and
     go to that directory while logged onto hgwdev.  Then:
        grep -v Pseudogene chr_20*.gtf | ldHgGene mm2 sanger20 stdin
	hgSanger20 mm2 *.gtf *.info


LOAD RNAGENES (todo)
      - login to hgwdev
      - cd ~kent/src/hg/lib
      - mm2 < rnaGene.sql
      - cd /cluster/store2/mm.2002.02/mm2/bed
      - mkdir rnaGene
      - cd rnaGene
      - download data from ftp.genetics.wustl.edu/pub/eddy/pickup/ncrna-oo27.gff.gz
      - gunzip *.gz
      - liftUp chrom.gff ../../jkStuff/liftAll.lft carry ncrna-oo27.gff
      - hgRnaGenes mm2 chrom.gff

LOAD EXOFISH (todo)
     - login to hgwdev
     - cd /cluster/store2/mm.2002.02/mm2/bed
     - mkdir exoFish
     - cd exoFish
     - mm2 < ~kent/src/hg/lib/exoFish.sql
     - Put email attachment from Olivier Jaillon (ojaaillon@genoscope.cns.fr)
       into /cluster/store2/mm.2002.02/mm2/bed/exoFish/all_maping_ecore
     - awk -f filter.awk all_maping_ecore > exoFish.bed
     - hgLoadBed mm2 exoFish exoFish.bed

LOAD MOUSE SYNTENY (todo)
     - login to hgwdev.
     - cd ~/kent/src/hg/lib
     - mm2 < mouseSyn.sql
     - mkdir ~/mm/bed/mouseSyn
     - cd ~/mm/bed/mouseSyn
     - Put Deanna Church's (church@ncbi.nlm.nih.gov) email attachment as
       mouseSyn.txt
     - awk -f format.awk *.txt > mouseSyn.bed
     - delete first line of mouseSyn.bed
     - Enter database with "mm2" command.
     - At mysql> prompt type in:
          load data local infile 'mouseSyn.bed' into table mouseSyn;


LOAD GENIE (done 05/30/02)
     mkdir -p ~/mm2/bed/genieAlt
     cd ~/mm2/bed/genieAlt
     wget http://www.neomorphic.com/mgap/mgscv3/gtf/mgscv3.genie.gtf.tgz
     gunzip -c mgscv3.genie.gtf.tgz | tar xvf -
     ldHgGene mm2 genieAlt mgscv3.genie.gtf/chr*.gtf
     wget http://www.neomorphic.com/mgap/mgscv3/fa/mgscv3.aa.tgz
     gunzip -c mgscv3.aa.tgz | tar xvf -
     hgPepPred mm2 genie geniePep chr*.aa.fa

LOAD GENIE CLONE BOUNDS (done 6/3/02)
     mkdir -p ~/mm2/bed/genieBounds
     cd ~/mm2/bed/genieBounds
     wget http://www.neomorphic.com/mgap/mgscv3/cb.bed/mgscv3_cb.bed.tgz
     gunzip -c mgscv3_cb.bed.tgz | tar xvf -
     - Trim the track definition from each file (these are actually custom 
       track files):
     foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Un)
       tail +2 chr${c}_cb.bed > chr${c}_cb-fixed.bed
     end
     hgLoadBed mm2 genieBounds *-fixed.bed


LOAD JACKSON LABS QTL (DONE 03/13/03)
    mkdir ~/mm2/bed/jaxQTL2
    cd ~/mm2/bed/jaxQTL2
    # Save the email attachment from Sridhar Ramachandran at Jackson Labs
    # (bed 8+, jaxQTL2 format).
    # Strip the column headers and load into the database.  
    tail +2 QTLBedFormat.txt > jaxQTL2.bed
    hgLoadBed -noBin -tab -sqlTable=$HOME/kent/src/hg/lib/jaxQTL2.sql \
      mm2 jaxQTL2 jaxQTL2.bed


LOAD SOFTBERRY GENES (todo)
     - ln -s /cluster/store2/mm.2002.02/mm2 ~/mm
     - cd ~/mm/bed
     - mkdir softberry
     - cd softberry
     - get ftp://www.softberry.com/pub/SC_MOU_NOV01/softb_mou_genes_nov01.tar.gz
     ldHgGene mm2 softberryGene chr*.gff
     hgPepPred mm2 softberry *.protein
     hgSoftberryHom mm2 *.protein

LOAD ACEMBLY (todo)
    - Get acembly*gene.gff from Jean and Danielle Thierry-Mieg and
      place in ~/mm/bed/acembly
    - Replace c_chr with chr in acembly*.gff
    - Replace G_t1_chr with chr and likewise
      G_t2_chr with chr, etc.
    - cd ~/mm/bed/acembly
    - # The step directly below is not necessary since the files were already lifted
      #  liftUp ./aceChrom.gff /cluster/store2/mm.2002.02/mm2/jkStuff/liftHs.lft warn acemblygenes*.gff
    - Use /cluster/store2/mm.2002.02/mm2/mattStuff/filterFiles.pl to prepend "chr" to the
    start of every line in the gene.gff files and to concat them into the aceChrom.gff
    file. Read the script to see what it does. It's tiny and simple.
    - Concatenate all the protein.fasta files into a single acembly.pep file
    - Load into database as so:
        ldHgGene mm2 acembly aceChrom.gff
        hgPepPred mm2 generic acemblyPep acembly.pep

LOAD GENOMIC DUPES (todo)
o - Load genomic dupes
    ssh hgwdev
    cd ~/mm/bed
    mkdir genomicDups
    cd genomicDups
    wget http://codon/jab/web/takeoff/oo33_dups_for_kent.zip
    unzip *.zip
    awk -f filter.awk oo33_dups_for_kent > genomicDups.bed
    mysql -u hgcat -pbigSECRET mm2 < ~/src/hg/lib/genomicDups.sql
    hgLoadBed mm2 -oldTable genomicDups genomicDups.bed

FAKING DATA FROM PREVIOUS VERSION
(This is just for until proper track arrives.  Rescues about
97% of data  Just an experiment, not really followed through on).

o - Rescuing STS track:
     - log onto hgwdev
     - mkdir ~/mm/rescue
     - cd !$
     - mkdir sts
     - cd sts
     - bedDown hg3 mapGenethon sts.fa sts.tab
     - echo ~/mm/sts.fa > fa.lst
     - pslOoJobs ~/mm ~/mm/rescue/sts/fa.lst ~/mm/rescue/sts g2g
     - log onto cc01
     - cd ~/mm/rescue/sts
     - split all.con into 3 parts and condor_submit each part
     - wait for assembly to finish
     - cd psl
     - mkdir all
     - ln ?/*.psl ??/*.psl *.psl all
     - pslSort dirs raw.psl temp all
     - pslReps raw.psl contig.psl /dev/null
     - rm raw.psl
     - liftUp chrom.psl ../../../jkStuff/liftAll.lft carry contig.psl
     - rm contig.psl
     - mv chrom.psl ../convert.psl


LOAD SLAM GENES (hg12)
     cd /cluster/store3/gs.13/build30/bed
     mkdir slam
     cd slam
     wget http://bio.math.berkeley.edu/slam/mouse/gff/UCSC/mmCDS.gff.gz
     wget http://bio.math.berkeley.edu/slam/mouse/gff/UCSC/mmCNS.gff.gz
     gunzip *
     ldHgGene -exon=CDS mm2 slam mmCDS.gff
     mv genePred.tab genePred.mm2

     awk '{print $1,$4,$5,$10,$12}' mmCNS.gff > mmCNS.bed
     sed -e 's/;//g' -e 's/"//g' mmCNS.bed > mmCNS.bed.2
     sort -n -k 5,5 mmCNS.bed.2 > mmCNS.bed.sort
     examine head and tail of sorted file for range of scores
     rm mmCNS.bed.sort
     size.pl < mmCNS.bed.2 > mmCNS.bed.2.size
     sort -n -k 2,2 mmCNS.bed.2.size > mmCNS.bed.2.size.sort
     examine head and tail of sorted file for range of sizes
     rm mmCNS.bed.2.size.sort
     expand.pl < mmCNS.bed.2 > mmCNS.bed.2.expand

SLAM (hg13)
     
     cd /cluster/store2/mm.2002.02/mm2/bed/slam
     mkdir hg13
     cd hg13

     wget http://baboon.math.berkeley.edu/~cdewey/slam/hs_31_Nov2002_mm_3_Feb2002/gff/mmCDS.gff.gz
     gunzip mmCDS.gff.gz
     mv mmCDS.gff mouseFromHumanCDS.gff
     ldHgGene -exon=CDS mm2 slamHuman mouseFromHumanCDS.gff

     wget http://baboon.math.berkeley.edu/~scawley/slam/hs_31_Nov2002_mm_3_Feb2002/gff/mmCNS.bed.gz
     gunzip mmCNS.bed.gz
     mv mmCNS.bed mouseFromHumanCNS.bed
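     # NOTE(review): input renamed to match the file created above; confirm the
     # target database/table (hg13 slamNonCodingMouse) is right for mouse coords.
     expand.pl < mouseFromHumanCNS.bed > mouseFromHumanCNS.bed.expand
     hgLoadBed -tab hg13 slamNonCodingMouse mouseFromHumanCNS.bed.expand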


CREATING THE musHumL SAMPLE TRACK (a.k.a WIGGLE TRACK)
------------------------------------------------------
o - refer to the script at src/hg/sampleTracks/makeMm2Hg12.doc

####################################################################
## NIA Mouse Gene Index - (WORKING - 2004-01-07 - Hiram)
#	requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov
#	pick up data (available only this one time)
    ssh hgwdev
    mkdir /cluster/data/mm2/bed/NIAGene
    cd /cluster/data/mm2/bed/NIAGene
    wget --timestamping \
	http://lgsun.grc.nia.nih.gov/geneindex/blatNAP-genome.txt
    wget --timestamping \
	"http://lgsun.grc.nia.nih.gov/Supplemental-Information/NAP.fasta"
    #	This file seems to have an extra field at the end ?
    #	It is always a 1 except for two entries that do not have
    #	this extra field.
    awk '{for (i=1; i<21; ++i) { printf "%s\t",$i; } printf "%s\n",$21;}' \
	blatNAP-genome.txt > blatNAP-genome.psl
    hgLoadPsl mm2 -table=NIAGene blatNAP-genome.psl
# load of NIAGene did not go as planned: 114560 record(s), 0 row(s) skipped,
#	12 warning(s) loading psl.tab

    mkdir /gbdb/mm2/NIAGene
    ln -s /cluster/data/mm2/bed/NIAGene/NAP.fasta \
	/gbdb/mm2/NIAGene/NAP.fa
    hgLoadSeq mm2 /gbdb/mm2/NIAGene/NAP.fa
