# for emacs: -*- mode: sh; -*-

# http://www.hgsc.bcm.tmc.edu/project-species-p-Rhesus%20Macaque.hgsc?pageLocation=Rhesus%20Macaque

#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AANU00

# Macaca Mulatta  (rheMac2)
# Venter Institute final split merge assembly using washU and baylor

#  NOTE:  this doc may have genePred loads that fail to include
#  the bin column.  Please correct that for the next build by adding
#  a bin column when you make any of these tables:
#
#  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
#  +-------------+---------------------------------+
#  | tableName   | type                            |
#  +-------------+---------------------------------+
#  | xenoRefGene | genePred xenoRefPep xenoRefMrna |
#  | nscanGene   | genePred nscanPep               |
#  +-------------+---------------------------------+


# DOWNLOAD SEQUENCE (DONE jan27  2006 Robert)
    ssh kkstore01
    mkdir /cluster/store2/rheMac2
    mkdir /cluster/store2/rheMac2/final
    cd /cluster/data
    ln -s /cluster/store2/rheMac2/final rheMac2
    cd /cluster/data/rheMac2
    mkdir downloads
    cd downloads


The macaque genome assembly (v. 1.0, Mmul051212) has been posted to the macaque genome ftp site:
ftp://macftp@ftp.macaquegenome.hgsc.bcm.tmc.edu/

The pw is analysis .


mkdir final
cd final
mkdir download
cd download

# A.  The assembly itself is represented in three groups, 1) the main assembly, 2) the Chr#_random and ChrU set, and 3) the ChrUr set (a late addition to the ChrU set).

#AGP and Fasta chromosome files for these three sets are:
#   1) Main Assembly - these are scaffolds, not chromosomes.
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1-edit4.trimNoContam.agp.bz2
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1_edit4.scf.fasta.bz2
#   2) Mapped to chromosomes but unplaced Chr#_random, and unmapped contigs ChrU
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.random.ctgs.agp.bz2 
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.random.scfs.agp.bz2 
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1_edit4.random.chrm.fasta.bz2
#   3) Late addition of 353 contigs to ChrU
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.ChrUr.chrm.fasta.bz2  
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.ChrUr.ctgs.agp.bz2     
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.ChrUr.scfs.agp.bz2 

# B.  The contigs fasta and quality files that go into these are in two sets, the bulk of the files (those in parts 1 and 2 of the assembly above) and the few that were added back at the last minute (part 3 above).  
#    1)
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/NewMergeSplit/v1.edit3.noContam.ctg.fasta.bz2
#    2)
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/NewMergeSplit/v1.edit3.noContam.ctg.qv.bz2
#    3) The 353 contigs that were added back at the last minute are included in the following files (other contigs in these files are not included in the assembly)
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/NewMergeSplit/v1.edit3.degen.noContam.fasta.bz2
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/NewMergeSplit/v1.edit3.degen.noContam.qv.bz2


# C.  The final assembly file in the VI internal assembly format is:
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.asm.bz2

#D.  The chromosome files are below, note, all gaps are listed as "fragment yes" in the agp files, although some are spanned by clones and others are not (making use of human synteny or macaque map information instead).  These files will be corrected to list the uncaptured gaps as "clone no", but the intervals and other information will not change, so use these for other information now.

wget -q ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.chrm.fasta.corr.bz2
wget -q ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.chrome.ctgs.agp.corr.bz2
wget -q ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.chrome.scfs.agp.corr.bz2

bunzip2 *.bz2
#wrap extra long lines in fasta
fold v1_edit4.random.chrm.fasta |sed -e 's/ChrUr/chrUr/' > rheMac2_random.fa
fold v1.edit4.chrm.fasta.corr |sed -e 's/ChrUr/chrUr/' >rheMac2.fa
fold v1_edit4.scf.fasta |sed -e 's/ChrUr/chrUr/' > rheMac2.scf.fa
mkdir ../Ur
fold v1.edit4.ChrUr.chrm.fasta |sed -e 's/ChrUr/chrUr/' > ../Ur/chrUr.fa
mv v1.edit4.chrome.ctgs.agp.corr v1.edit4.chrome.ctgs.corr.agp
mv v1.edit4.chrome.scfs.agp.corr v1.edit4.chrome.scfs.corr.agp

#*** NOTE FOR NEXT TIME *** use makeGenomeDb.pl instead of the following
#                           several sections:

# Check that IDs in .fa match IDs in .qual

  ssh kkstore01
  cd /cluster/data/rheMac2/downloads
  # foreach Bin0 contigs linearScaffolds reads
  grep ">" *.fasta > id.fa
  grep ">" *.qv > id.qual
  diff id.fa id.qual

# change chromosomes to lower case

sed -e 's/Chr/chr/' v1.edit4.chrome.ctgs.agp.final > v1.edit4.chrome.ctgs.final.fix2.agp
cat v1.edit4.chrome.scfs.agp.final |sed -e 's/Chr/chr/' > v1.edit4.final.fix2.agp
hgGoldGapGl -noGl rheMac2 /cluster/data/rheMac2/downloads/v1.edit4.final.fix2.agp

  /cluster/bin/i386/checkAgpAndFa v1.edit4.final.fix.agp rheMac2.fa > check.out   

# SPLIT UP chromosome and make 500k chunks for RM


  ssh kkstore02
  cd /cluster/data/rheMac2

  faSplit byname downloads/rheMac2.fa chrom
  for i in `awk '{print $1}' chrom.sizes` ; do echo $i ; mkdir -p ${i##chr} ; mv $i.fa ${i##chr} ; pushd ${i##chr} ; faSplit -lift=$i.lft size $i.fa 500000 split ; popd; done 
  # generate list
  find scaffolds -print > scaffolds.list.0
  grep fa scaffolds.list.0 > scaffolds.list

#### MAKE 2BIT NIB FILE 

  ssh kkstore01
  nice /cluster/bin/i386/faToTwoBit rheMac2.fa rheMac2.2bit
  nice /cluster/bin/i386/twoBitToFa rheMac2.2bit check.fa
  mv rheMac2.2bit ..
  # could also use cmp
  # note: size match
  faSize -detailed=on rheMac2.fa >mac.len 
  faSize -detailed=on check.fa >check.len 
  diff mac.len check.len
  wc -l *.len

  ssh hgwdev
  mkdir /gbdb/rheMac2
  ln -s /cluster/data/rheMac2/rheMac2.2bit /gbdb/rheMac2/rheMac2.2bit


#### CREATE DATABASE (DONE Jan 27 , 2006 Robert)

  ssh hgwdev
  hgsql hg17
  create database rheMac2;
  use rheMac2;
  create table grp (PRIMARY KEY(NAME)) select * from hg17.grp;
  # add rows to hgcentraltest on hgwbeta in dbDb and defaultDb
  # set orderKey past dog, before chimp

# LOAD GAP & GOLD TABLES FROM AGP 
    ssh hgwdev
    hgGoldGapGl -noGl rheMac2 /cluster/data/rheMac2/downloads/v1.edit4.chrome.scfs.final.fix.agp
# CREATE GAP TABLE  from v1.edit4.chrome.ctgs.final.fix.agp to get gaps

  ssh hgwdev
  cd kent/src/hg/lib
  # remove bin column for now
  hgsql rheMac2 < gap2.sql
  cd /cluster/data/rheMac2/downloads
  grep fragment v1.edit4.chrome.ctgs.final.fix.agp > fragment.txt
  grep clone v1.edit4.chrome.ctgs.final.fix.agp >> fragment.txt 
  hgsql rheMac2
  load data local infile 'fragment.txt' into table gap
  # create trackDb/monkey/rheMac2/gap.html

# CREATE CONTIG TABLE  (DONE Feb 15,2006 Robert)

  ssh hgwdev
  cd kent/src/hg/lib
  cp ~/kent/src/hg/lib/ctgPos.sql .
  # edit ctgPos.sql to increase size of primary key to 20
  # this doesn't include strand or contig start/end
  hgsql rheMac2 < ctgPos.sql
  cd /cluster/data/rheMac2/downloads
  grep -v clone v1.edit4.chrome.ctgs.final.fix.agp |grep -v fragment > Contig.out

  ssh kkstore02
  cd /cluster/data/rheMac2/downloads
  getContig.pl < Contig.out > ctgPos
  ssh hgwdev
  load data local infile 'ctgPos' into table ctgPos
  #convert to half open and fix bug in agp file
  hgsql rheMac2 -e "update ctgPos set chromStart = chromStart -1 "
  hgsql rheMac2 -e "update ctgPos set size = size -1 "
# CREATE CHROMINFO TABLE 
# an improvement here would be to have getMaxCoord output the 2bit file name
  ssh hgwdev
  cd /cluster/data/rheMac2
  # get coordinates from agp and put in database table
  getcoords.pl < downloads/v1.edit4.chrome.scfs.final.fix.agp > scaffolds.coords
  hgsql rheMac2 < /cluster/data/bosTau1/coords.sql
  load data local infile 'scaffolds.coords' into table scaffoldCoords
  faSize rheMac2.fa -detailed=on > chrom.sizes

  # calculate maximum coords; check that start coord is always 1
##  /cluster/home/heather/bin/i386/GetMaxCoord > getMaxCoord.rheMac2

   edit chromInfo.sql; allow fileName to be null
   cp ~baertsch/kent/src/hg/lib/chromInfo.sql .
   hgsql rheMac2 < chromInfo.sql
   load data local infile 'chrom.sizes' into table chromInfo
   update chromInfo set fileName = "/gbdb/rheMac2/rheMac2.2bit"



# REPEATMASKER 
# using the split scaffold fa files generated earlier

#*** NOTE FOR NEXT TIME *** use doRepeatMasker.pl instead.

# note the version of RepeatMasker in use; will want this for download README
  cat /cluster/bluearc/RepeatMasker/Libraries/version
# RM database version 20060120

# do a trial run
  ssh hgwdev
  cd /cluster/data/rheMac2/scaffolds/0/0/0
  /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec macaca Contig1000.fa
# configure
  cd /cluster/data/rheMac2
  mkdir jkStuff
  cp /cluster/data/anoGam1/jkStuff/RMAnopheles jkStuff/RMRhesus
  # change references anoGam1 --> rheMac2
  mkdir RMRun

  # /bin/csh makeJoblist-RM.csh

  # do the run  (DONE Jan 30, 2006 Robert)
  ssh pk
  cd /cluster/data/rheMac2/RMRun
  para create RMJobs.2
  para try
  para push

  # concatenate into one output file; took about 90 minutes
  ssh kkstore01
  cd /cluster/data/rheMac2
  mkdir repeats
  /bin/tcsh concatRM.csh
  cd repeats
  # use grep -v to get rid of all the headers
  # then, add back in an initial header
  # this is so hgLoadOut is happy
  grep chr repeats.all > repeats.clean
  cat repeats.header repeats.clean > repeats.final
  grep "There were no repetitive sequences " repeats.final > repeats.notfound
  grep -v "There were no repetitive sequences " repeats.final > repeat.out
  ssh hgwdev
  hgLoadOut rheMac2 repeat.out 2>errors 
  # Strange perc. fields:

Processing repeats.out
Strange perc. field -6.4 line 652513 of repeat.out
Strange perc. field -2236.0 line 749163 of repeat.out
Strange perc. field -2637.9 line 749163 of repeat.out
Strange perc. field -99.3 line 749163 of repeat.out
Strange perc. field -0.3 line 1386112 of repeat.out
Strange perc. field -0.2 line 2045133 of repeat.out
Strange perc. field -0.4 line 2429164 of repeat.out
Strange perc. field -4.8 line 2778536 of repeat.out
Strange perc. field -0.4 line 2912076 of repeat.out
Strange perc. field -5.5 line 3498109 of repeat.out
Strange perc. field -1.0 line 3642974 of repeat.out
Strange perc. field -2.7 line 3642974 of repeat.out
Strange perc. field -263.9 line 3711627 of repeat.out
Strange perc. field -187.0 line 3711627 of repeat.out
Strange perc. field -3.4 line 3795344 of repeat.out
Strange perc. field -0.1 line 3932239 of repeat.out
Strange perc. field -0.2 line 4092279 of repeat.out
Strange perc. field -0.1 line 4240026 of repeat.out
Strange perc. field -0.1 line 4440809 of repeat.out
Strange perc. field -7915.8 line 4506386 of repeat.out
Strange perc. field -0.1 line 4514494 of repeat.out
Strange perc. field -4361.2 line 4667281 of repeat.out
Strange perc. field -840.4 line 4667281 of repeat.out

Loading up table repeats_rmsk
note: 997 records dropped due to repStart > repEnd

  hgsql rheMac2
  rename table repeats_rmsk to rmsk

  # select count(*) from rmsk;

# SIMPLE REPEATS (DONE Jan 30, 2006 Robert)
# put the results throughout the scaffold 0/0/0 directories,
# same as RepeatMasker, to avoid too many files in the same directory
# *** BUG: FILTERING WAS NOT DONE --> OVERMASKING ***

#*** NOTE FOR NEXT TIME *** use doSimpleRepeat.pl instead.

  ssh kkstore01
  cd /cluster/data/rheMac2
  mkdir bed
  cd bed
  mkdir simpleRepeat
  cd simpleRepeat

  /bin/csh makeJoblist-trf.csh
  tcsh trf-run.csh > & ! trf.log &

  # concatenate into one output file; took about an hour
    cat chr*.bed > simpleRepeat.bed


  # load
  /cluster/bin/i386/hgLoadBed rheMac2 simpleRepeat simpleRepeat.bed \
    -sqlTable = /cluster/home/heather/kent/src/hg/lib/simpleRepeat.sql

#Reading trf.bed
#Loaded 628275 elements of size 16
#Sorted
#Saving bed.tab
#Loading rheMac2

  create index chrom_2 on simpleRepeat (chrom(16), chromStart);
  create index chrom_3 on simpleRepeat (chrom(16), chromEnd);


# CREATE MASKED FA USING REPEATMASKER AND FILTERED TRF FILES 

  ssh kkstore02
  cd /cluster/data/rheMac2
  /cluster/bin/i386/maskOutFa -soft downloads/rheMac2.fa repeats/repeat.out rheMac2.softmask.fa

WARNING: negative rEnd: -11 chr1:17040504-17040634 MLT2B4
WARNING: negative rEnd: -15 chr1:39400221-39400370 MLT2B4
WARNING: negative rEnd: -28 chr1:75866521-75866637 MLT2B4
WARNING: negative rEnd: -104 chr1:102866722-102866763 MLT2B4
WARNING: negative rEnd: -52 chr1:132930605-132930673 MLT2B4
WARNING: negative rEnd: -686 chr1:133186227-133186372 L1MCc
WARNING: negative rEnd: -203 chr1:133186415-133186749 L1MCc
WARNING: negative rEnd: -455 chr1:171379505-171379760 L1M3e
WARNING: negative rEnd: -17 chr1:216018086-216018189 L1MCb
WARNING: negative rEnd: -198 chr1:226920721-226920777 L1M4
WARNING: negative rEnd: -131 chr1:226921141-226921200 L1M4
WARNING: negative rEnd: -3 chr1:226921979-226922094 L1M4
WARNING: negative rEnd: -335 chr1:227071405-227071650 L1MCc
WARNING: negative rEnd: -278 chr2:16216076-16216253 L1P4b
WARNING: negative rEnd: -72 chr2:18621605-18622180 L1PBa
WARNING: negative rEnd: -285 chr2:24878971-24879214 L1MCc
WARNING: negative rEnd: -419 chr2:25206834-25207017 L1MCc
WARNING: negative rEnd: -183 chr2:25207033-25207203 L1MCc
WARNING: negative rEnd: -3 chr2:48529930-48529991 MER6
WARNING: negative rEnd: -426 chr2:80341345-80341639 L1M2c
WARNING: negative rEnd: -314 chr2:82400541-82400763 L1MDa
WARNING: negative rEnd: -1015 chr2:82731009-82731348 L1ME1
WARNING: negative rEnd: -282 chr2:82731554-82731973 L1ME1
WARNING: negative rEnd: -129 chr2:113802567-113802946 L1MCc
WARNING: negative rEnd: -528 chr2:129549508-129549693 L1M3e
WARNING: negative rEnd: -437 chr2:160075806-160076108 L1M2
WARNING: negative rEnd: -21 chr3:39033062-39033110 MER6
WARNING: negative rEnd: -271 chr3:58615645-58615852 L1MEb
WARNING: negative rEnd: -202 chr3:70911642-70911963 L1M5
WARNING: negative rEnd: -33 chr3:71817441-71818111 L1M3e
WARNING: negative rEnd: -316 chr3:112797698-112798023 L1PB3
WARNING: negative rEnd: -17 chr3:113476319-113476407 FRAM
WARNING: negative rEnd: -27 chr3:116802770-116802992 L1MB7
WARNING: negative rEnd: -4 chr3:126585635-126585722 FRAM
WARNING: negative rEnd: -632 chr3:183268961-183269045 L1M3b
WARNING: negative rEnd: -1132 chr4:16324742-16324819 L1MEb
WARNING: negative rEnd: -930 chr4:38188841-38189000 L1MEd
WARNING: negative rEnd: -448 chr4:122440460-122440750 L1M3e
WARNING: negative rEnd: -39 chr5:7580689-7581026 L1MEe
WARNING: negative rEnd: -1956 chr5:27414234-27414442 L1MB8
WARNING: negative rEnd: -1682 chr5:27414453-27414671 L1MB8
WARNING: negative rEnd: -513 chr5:66093155-66093311 L1M4b
WARNING: negative rEnd: -35 chr5:97364737-97364827 L1MCa
WARNING: negative rEnd: -1077 chr5:109355971-109356130 L1MEb
WARNING: negative rEnd: -20 chr5:109357351-109357727 L1MEb
WARNING: negative rEnd: -102 chr5:119072794-119072945 L1MB4
WARNING: negative rEnd: -9 chr5:144123391-144123455 L1M3de
WARNING: negative rEnd: -59 chr5:150681576-150681827 L1M4c
WARNING: negative rEnd: -461 chr5:170348753-170349060 L1M3e
WARNING: negative rEnd: -138 chr5:170614438-170615156 L1M1
WARNING: negative rEnd: -103 chr5:180911655-180912016 L1MEd
WARNING: negative rEnd: -38 chr5:181275921-181276086 L1M4
WARNING: negative rEnd: -219 chr6:326885-327048 L1MCc
WARNING: negative rEnd: -243 chr6:18237822-18238057 L1MC
WARNING: negative rEnd: -458 chr6:24563135-24563511 L1M4b
WARNING: negative rEnd: -14 chr6:27838867-27838984 MLT2B4
WARNING: negative rEnd: -92 chr6:33082868-33082962 L1M4
WARNING: negative rEnd: -505 chr6:60951240-60951389 L1PB3
WARNING: negative rEnd: -9 chr6:76795983-76796095 MLT2B4
WARNING: negative rEnd: -1039 chr6:153171051-153171255 L1MEb
WARNING: negative rEnd: -448 chr6:156038512-156038675 L1M4b
WARNING: negative rEnd: -1797 chr6:172500974-172501240 L1M4b
WARNING: negative rEnd: -717 chr7:6530435-6530561 L1M4b
WARNING: negative rEnd: -180 chr7:6530567-6530816 L1M4b
WARNING: negative rEnd: -395 chr7:17573119-17573355 L1M4b
WARNING: negative rEnd: -713 chr7:34841140-34841243 L1PA12
WARNING: negative rEnd: -917 chr7:36602478-36602794 L1MEd
WARNING: negative rEnd: -993 chr7:49724690-49725009 L1MEc
WARNING: negative rEnd: -519 chr7:64924143-64924287 L1PA15-16
WARNING: negative rEnd: -170 chr7:64924684-64924986 L1PA15-16
WARNING: negative rEnd: -1526 chr7:130257854-130258027 L1MDa
WARNING: negative rEnd: -78 chr8:18037533-18037781 L1M3e
WARNING: negative rEnd: -643 chr8:25868692-25868877 L1MCc
WARNING: negative rEnd: -5 chr8:37458013-37458057 FRAM
WARNING: negative rEnd: -420 chr8:40430422-40430587 L1MCc
WARNING: negative rEnd: -73 chr8:53713425-53713503 MLT2B4
WARNING: negative rEnd: -38 chr8:58045714-58046896 L1MEd
WARNING: negative rEnd: -46 chr8:70060490-70060930 L1M4
WARNING: negative rEnd: -3 chr8:71535410-71535460 FRAM
WARNING: negative rEnd: -781 chr8:71840470-71840637 L1MCc
WARNING: negative rEnd: -613 chr8:71841566-71841717 L1MCc
WARNING: negative rEnd: -17 chr9:17870182-17870240 FRAM
WARNING: negative rEnd: -1077 chr9:36754551-36754681 L1MEb
WARNING: negative rEnd: -354 chr9:36754801-36755003 L1MEb
WARNING: negative rEnd: -189 chr9:53706555-53706958 L1MCc
WARNING: negative rEnd: -28 chr9:55631220-55631257 MER6
WARNING: negative rEnd: -259 chr9:83699632-83700159 L1MA4A
WARNING: negative rEnd: -412 chr10:93305328-93305380 L1MDb
WARNING: negative rEnd: -290 chr10:93305684-93305806 L1MDb
WARNING: negative rEnd: -232 chr11:93725539-93725691 L1MCc
WARNING: negative rEnd: -3 chr11:102814963-102815011 FRAM
WARNING: negative rEnd: -614 chr12:24319914-24321380 L1M4b
WARNING: negative rEnd: -557 chr12:24323027-24323094 L1M4b
WARNING: negative rEnd: -259 chr12:100959333-100959494 L1M1
WARNING: negative rEnd: -529 chr12:102350218-102350466 L1M2
WARNING: negative rEnd: -654 chr13:53814747-53815048 L1M4b
WARNING: negative rEnd: -7 chr13:58060931-58061578 L1M4c
WARNING: negative rEnd: -1125 chr13:72166501-72166711 L1ME1
WARNING: negative rEnd: -6 chr13:72540001-72540050 FRAM
WARNING: negative rEnd: -397 chr13:82523197-82523664 L1M3e
WARNING: negative rEnd: -119 chr13:120528068-120528107 MLT2B3
WARNING: negative rEnd: -1934 chr14:27100172-27100323 L1MCc
WARNING: negative rEnd: -158 chr14:27101192-27101535 L1MCc
WARNING: negative rEnd: -281 chr14:60812899-60813046 L1MDa
WARNING: negative rEnd: -376 chr15:22274805-22275047 L1M4b
WARNING: negative rEnd: -669 chr15:23733251-23733307 L1M3e
WARNING: negative rEnd: -422 chr15:24246692-24246910 L1MCc
WARNING: negative rEnd: -290 chr15:24247095-24247225 L1MCc
WARNING: negative rEnd: -252 chr15:24247564-24247617 L1MCc
WARNING: negative rEnd: -11 chr15:24247667-24247964 L1MCc
WARNING: negative rEnd: -261 chr15:77304256-77304526 L1M4c
WARNING: negative rEnd: -121 chr15:80339141-80339347 L1M4b
WARNING: negative rEnd: -488 chr15:83773572-83773877 L1MEd
WARNING: negative rEnd: -101 chr15:83773914-83774074 L1MEd
WARNING: negative rEnd: -92 chr16:40096072-40096144 MLT2B4
WARNING: negative rEnd: -76 chr17:44749449-44749518 MLT2B4
WARNING: negative rEnd: -97 chr17:85835577-85836019 L1MCc
WARNING: negative rEnd: -112 chr18:12965293-12965685 L1MCc
WARNING: negative rEnd: -879 chr18:41292895-41293389 L1MEc
WARNING: negative rEnd: -322 chr18:41294544-41295033 L1MEc
WARNING: negative rEnd: -1137 chr18:59335523-59335630 L1MEb
WARNING: negative rEnd: -102 chr19:5934741-5934887 L1M4
WARNING: negative rEnd: -6 chr19:11007926-11008075 FRAM
WARNING: negative rEnd: -24 chr19:44501149-44501305 L1M3b
WARNING: negative rEnd: -145 chr20:45957350-45957401 L1MDa
WARNING: negative rEnd: -1106 chr20:74584188-74584318 L1ME1
WARNING: negative rEnd: -348 chr20:74584451-74584806 L1ME1
WARNING: negative rEnd: -711 chr20:80285832-80285915 L1MCc
WARNING: negative rEnd: -37 chrX:4743643-4743679 MER6
WARNING: negative rEnd: -1941 chrX:48147060-48147184 L1M4b
WARNING: negative rEnd: -99 chrX:54535186-54535349 L1M4
WARNING: negative rEnd: -606 chrX:80315181-80315657 L1P4b
WARNING: negative rEnd: -65 chrX:97720783-97720963 L1M4
WARNING: negative rEnd: -24 chrX:120523759-120523982 L1ME1
WARNING: negative rEnd: -13 chrX:121193467-121193615 MLT2B4
WARNING: negative rEnd: -939 chrX:153527918-153528288 L1M4

  # matches select count(*) from rmsk where repEnd < 0;
+----------+
| count(*) |
+----------+
|      131 |
+----------+

  # DOH -- THIS MASKS TOO MUCH!  SHOULD HAVE USED FILTERED TRF!
  /cluster/bin/i386/maskOutFa -softAdd rheMac2.softmask.fa bed/simpleRepeat/simpleRepeat.bed rheMac2.softmask2.fa

  # hard masking (Ns instead of lower case) for download files
  # split for use by genscan
  /cluster/bin/i386/maskOutFa rheMac2.softmask2.fa hard rheMac2.hardmask.fa
  mkdir hardmask-split
  /cluster/bin/i386/faSplit about rheMac2.hardmask.fa 2000000 hardmask-split

# DOWNLOAD FILES
# REMAKE SOME BIGZIPS DOWNLOADS IN SAME FORMAT AS FOR OTHER ASSEMBLIES 
# WITH ONE FILE PER CHROMOSOME (DONE, 2006-06-12, hartera)

#*** NOTE FOR NEXT TIME *** use makeDownloads.pl instead.

  ssh hgwdev
  cd /cluster/data/rheMac2
  cp rheMac2.softmask.fa /usr/local/apache/htdocs/goldenPath/rheMac2/bigZips
  cp rheMac2.softmask2.fa /usr/local/apache/htdocs/goldenPath/rheMac2/bigZips
  cp rheMac2.hardmask.fa /usr/local/apache/htdocs/goldenPath/rheMac2/bigZips

  cd /usr/local/apache/htdocs/goldenPath/rheMac2/bigZips
  nice gzip rheMac2.softmask.fa
  nice gzip rheMac2.softmask2.fa
  nice gzip rheMac2.hardmask.fa
  # Remake some of the bigZips downlods in the same format as for 
  # other assemblies with one file per chrom (2006-06-12, hartera)
  ssh kkstore01
  cd /cluster/data/rheMac2
  # use rheMac2.softmask2.fa as they contain masking from RepeatMasker
  # and from trf.
  cat << '_EOF_' > jkStuff/zipAll.csh
mkdir -p bigZips
# RepeatMasker files
tar cvzf bigZips/chromOut.tar.gz ?{,?}/chr*.fa.out
tar cvzf bigZips/chromTrf.tar.gz bed/simpleRepeat/chr*.bed
# get soft masked and hard masked sequences, one per chrom
mkdir softMask hardMask
foreach c (`awk '{print$1}' /cluster/data/rheMac2/chrom.sizes`)
   faOneRecord /cluster/data/rheMac2/rheMac2.softmask2.fa \
        $c > softMask/${c}.fa
   faOneRecord /cluster/data/rheMac2/rheMac2.hardmask.fa \
        $c > hardMask/${c}.fa.masked
end
tar cvzf bigZips/chromFa.tar.gz softMask/chr*.fa
tar cvzf bigZips/chromFaMasked.tar.gz hardMask/chr*.fa.masked
'_EOF_'
   # << this line makes emacs coloring happy
   chmod +x jkStuff/zipAll.csh
   csh -ef ./jkStuff/zipAll.csh >& zipAll.log &
   # Took about 50 minutes.
   rm repeats.all.gz  rheMac2.hardmask.fa.gz  rheMac2.softmask.fa.gz \
      rheMac2.chrom.fa.gz
   md5sum *.gz > md5sum.txt
   # link the *.gz and *.txt files to hgwdev:/usr/local/apache/...
   ssh hgwdev
   set gp = /usr/local/apache/htdocs/goldenPath/rheMac2
   mkdir -p $gp/bigZips
   rm repeats.all.gz  rheMac2.hardmask.fa.gz  rheMac2.softmask.fa.gz \
      rheMac2.chrom.fa.gz md5sum.txt
   ln -s /cluster/data/rheMac2/bigZips/{chrom*.gz,*.txt} $gp/bigZips
   # Update README.txt to give correct file names.

# REGENERATE 2BIT NIB (DONE Jan 30, 2006 Robert)
  ssh kkstore02
  cd /cluster/data/rheMac2
  /cluster/bin/i386/faToTwoBit rheMac2.softmask2.fa rheMac2.softmask.2bit
  /cluster/bin/i386/twoBitToFa rheMac2.softmask.2bit check.softmask.fa
  diff -q rheMac2.softmask2.fa check.softmask.fa
  mv rheMac2.2bit rheMac2.2bit.unmasked
  mv rheMac2.softmask.2bit rheMac2.2bit
  # note: size match

# GENERATE 2bit for blastz 
  mkdir -p /san/sanvol1/scratch/rheMac2/ 
  cd /cluster/data/rheMac2
  cp rheMac2.2bit /san/sanvol1/scratch/rheMac2/ -p


ssh kkstore02
cd /cluster/data/hg17/bed/blastz.rheMac2
# edit doBlastzChainNet.pl to add lines for hg17 glitch
# in function getDbFromPath
#  if ($db eq "hg") {
#      $db = "hg17";
#  }
#doBlastzChainNet.pl DEF -blastzOutRoot /san/sanvol1/scratch/rheMac2/rheMacOut/ >& do.log &

############################################################################
##  BLASTZ swap from mm8 alignments (DONE - 2006-02-18 - Hiram)
    ssh pk
    cd /cluster/data/mm8/bed/blastz.rheMac2.2006-02-17
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
        `pwd`/DEF > swap.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
        -swap -continue=net `pwd`/DEF > swap.net.out 2>&1 &
    #   failed during a san hiccup,  finish that off, then:
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
        -swap -continue=load `pwd`/DEF > swap.load.out 2>&1 &

    time nice -n +19 featureBits rheMac2 chainMm8Link
    #   877906099 bases of 2646704109 (33.170%) in intersection

    #	This was done before the loadUp script was improved to load
    #	empty tables on chroms that have no chains, so sometime later:
    ssh hgwdev
    hgLoadChain rheMac2 chrUr_chainMm8 /dev/null
    #	Loading 0 chains into rheMac2.chrUr_chainMm8
    #	2006-04-24

############################################################################
# QUALITY SCORES (Done March 9, 2006 Robert)
    ssh kkstore01
    wget -q ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu:21/Assemblies/VI/FinalMergeSplit/v1.edit4.chrm.qv.bz2
    cd /cluster/data/rheMac2/downloads/
    bunzip2 v1.edit4.chrm.qv.bz2 &
    bunzip2 v1.edit4.chrUr.bz2 &
    nice fold --spaces v1.edit4.chrm.qv | sed -e 's/Chr/chr/' > rheMac2.chrm.qual.qv 
    nice fold --spaces v1.edit4.chrUr.qv | sed -e 's/Chr/chr/' > rheMac2.chrUr.qv 
    cat rheMac2.chrm.qual.qv rheMac2.chrUr.qv > rheMac2.qual.qv
    qaToQac rheMac2.qual.qv temp.qac 
    gzip rheMac2.qual.qv
    qacToWig -fixed temp.qac rheMac2.wig 
    rm temp.qac
    wigEncode rheMac2.wig rheMac2.qual.wig rheMac2.qual.wib 
    mv all.qv rheMac2.qual.qv
    qaToQac rheMac2.qual.qv temp.qac
    qacToWig -fixed temp.qac rheMac2.wig
    wigEncode rheMac2.wig rheMac2.qual.wig rheMac2.qual.wib
    ssh hgwdev
    hgLoadWiggle rheMac2 quality rheMac2.qual.wig
    cp -p rheMac2.qual.qv.gz /usr/local/apache/htdocs/rheMac2/bigZips


# RUN DATEREPEATS TO GET LINEAGE-SPECIFIC (DONE 3/22/06 angie)
    ssh kkstore01
    cd /cluster/data/rheMac2

    # First make per-chrom .fa.out files as should have been done:
    tail +4 repeats/repeat.out \
    | perl -we '$prevChr = ""; \
                while (<>) { \
                  @w = split; $chr = $w[4]; $c = $chr; $c =~ s/^chr//; \
                  if ($chr ne $prevChr) { \
                    print "creating $c/$chr.fa.out\n"; \
                    system("cat repeats/repeats.header > $c/$chr.fa.out"); \
                    close(OUT);  open(OUT, ">>$c/$chr.fa.out") || die; \
                  } \
                  print OUT; \
                  $prevChr = $chr; \
                }'

    # Run DateRepeats to get outputs on rodent, non-rodent mammal:
    mkdir linSpecRep
    cd linSpecRep
    ln -s ../*/chr*.fa.out .
    foreach f (*.fa.out)
      echo $f:r:r
      /cluster/bluearc/RepeatMasker/DateRepeats $f -query human \
        -comp mouse -comp dog
    end
    mkdir /cluster/bluearc/rheMac2/linSpecRep
    mkdir /cluster/bluearc/rheMac2/linSpecRep/notIn{Rodent,NonRodentMammal}
    foreach f (*.out_mus*)
      set chr = $f:r:r
      echo $chr
      /cluster/bin/scripts/extractRepeats 1 $f \
       > /cluster/bluearc/rheMac2/linSpecRep/notInRodent/$chr.out.spec
      /cluster/bin/scripts/extractRepeats 2 $f \
       > /cluster/bluearc/rheMac2/linSpecRep/notInNonRodentMammal/$chr.out.spec
    end


# BLASTZ SWAP FOR HUMAN (hg17) (DONE, 2006-03-22, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET, LIFTOVER AND ALIGNMENT DOWNLOADS
    # Documented in makeHg17.doc in the section:
    # SWAP CHAIN AND NET ALIGNMENTS OVER TO RHESUS (rheMac2)
    # alignments are in: /cluster/data/rheMac2/bed/blastz.hg17.swap


# SWAP CHAINS/NET RN4 (DONE 3/23/06 angie)
    ssh kkstore01
    mkdir /cluster/data/rheMac2/bed/blastz.rn4.swap
    cd /cluster/data/rheMac2/bed/blastz.rn4.swap
    doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.rheMac2/DEF \
      >& do.log & tail -f do.log
    ln -s blastz.rn4.swap /cluster/data/rheMac2/bed/blastz.rn4


###########################################################################
# 5 way multiz WITH RheMac2 (robert, 5/3/06) -- MM8, RN4, CANFAM2, HG18
# rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd)

    # first build 5-way multiz alignment from syntenic nets (helps reduce
    # false positive predictions due to paralogous alignments)
    # (prepare mafNet files from syntenic nets and copy to
    # /san/sanvol1/scratch/rheMac2/mafNetSyn; do this for mm8, rn4, canFam2, rheMac2

ssh kkstore02
cd /cluster/data/rheMac2/bed
mkdir exoniphy
cd exoniphy
    cat > makeSynMaf.sh << 'EOF'
#!/bin/bash
fromdb=$1
todb=$2
DIR=$3
outdir=$4/$2

base=`pwd`
cut -f1 /cluster/data/$fromdb/chrom.sizes > chrom.lst
echo making syntenic mafs for $todb in $DIR from $base
cd $DIR
cd axtChain
nice netFilter -syn $fromdb.$todb.net.gz > $fromdb.$todb.syn.net
mkdir -p netSynSplit
echo splitting nets
nice netSplit $fromdb.$todb.syn.net netSynSplit
mkdir -p chain
echo splitting chains $fromdb.$todb.all.chain.gz
nice chainSplit chain $fromdb.$todb.all.chain.gz
echo making mafs
mkdir -p $outdir
for i in `cat $base/chrom.lst` ; do echo "netToAxt netSynSplit/$i.net chain/$i.chain /cluster/data/$fromdb/nib /cluster/data/$todb/nib stdout | axtToMaf stdin -tPrefix=$fromdb. -qPrefix=$todb. /cluster/data/$fromdb/chrom.sizes /cluster/data/$todb/chrom.sizes $outdir/$i.maf " ; nice netToAxt netSynSplit/$i.net chain/$i.chain /cluster/data/$fromdb/nib /cluster/data/$todb/nib stdout | axtToMaf stdin -tPrefix=$fromdb. -qPrefix=$todb. /cluster/data/$fromdb/chrom.sizes /cluster/data/$todb/chrom.sizes $outdir/$i.maf ; done
echo removing temporary files
rm -rf axtChain/chain
rm -rf axtChain/netSynSplit/
# makes syntenic net from net 
#   with $1 as reference and $2 as query 
#   uses blastz alignment in $3 then converts to maf written to $4
# sample usage
# makeSynMaf.sh rheMac2 canFam2 /cluster/data/rheMac2/bed/blastz.canFam2 /san/sanvol1/scratch/rheMac2/mafNetSyn
'EOF'
# << for emacs

makeSynMaf.sh rheMac2 canFam2 /cluster/data/rheMac2/bed/blastz.canFam2 /san/sanvol1/scratch/rheMac2/mafNetSyn
makeSynMaf.sh rheMac2 hg18 /cluster/data/rheMac2/bed/blastz.hg18 /san/sanvol1/scratch/rheMac2/mafNetSyn
makeSynMaf.sh rheMac2 mm8 /cluster/data/rheMac2/bed/blastz.mm8 /san/sanvol1/scratch/rheMac2/mafNetSyn
makeSynMaf.sh rheMac2 rn4 /cluster/data/rheMac2/bed/blastz.rn4 /san/sanvol1/scratch/rheMac2/mafNetSyn

    # make output dir and run dir
    ssh pk
    mkdir -p /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2
    cd /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2

    mkdir -p mafSyn runSyn
    cd runSyn

    # create scripts to run multiz on cluster
cat > autoMultiz.csh << 'EOF'
#!/bin/csh -ef

    set path=(/cluster/bin/penn/multiz.v11.x86_64/ $path )
    set db = rheMac2
    set c = $1
    set maf = $2
    set run = /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/
    set tmp = /scratch/tmp/$db/multiz.syn.$c
    set pairs = /san/sanvol1/scratch/$db/mafNetSyn
    rm -fr $tmp
    mkdir -p $tmp
    cp ../{tree.nh,species.lst} $tmp
    pushd $tmp
    foreach s (`cat species.lst`)
        set in = $pairs/$s/$c.maf
        set out = $db.$s.sing.maf
        if ($s == rheMac2) then
            continue
        endif
        if (-e $in.gz) then
            zcat $in.gz > $out
        else if (-e $in) then
            cp $in $out
        else
            echo "##maf version=1 scoring=autoMZ" > $out
        endif
    end
    set path = ($run $path); rehash
    $run/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
    popd
    cp $tmp/$c.maf $maf
    rm -fr $tmp
'EOF'
# << for emacs
    chmod +x autoMultiz.csh

cat > gsub << 'EOF'
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/rheMac2/bed/multiz.rheMac2Hg18Mm8Rn4CanFam2/mafSyn/$(root1).maf}
#ENDLOOP
'EOF'
# << for emacs
    cut -f 1 /cluster/data/rheMac2/chrom.sizes > chrom.lst
    set path = (/parasol/bin $path);rehash
    gensub2 chrom.lst single gsub jobList
    para create jobList
        # 46 jobs
    para try; para check
    para push

    # Load into database
    ssh hgwdev
    cd /cluster/data/rheMac2/bed/multiz5waySyn/mafSyn
    mkdir -p /gbdb/rheMac2/multiz5waySyn/mafSyn
    ln -s /cluster/data/rheMac2/bed/multiz5waySyn/mafSyn/*.maf \
        /gbdb/rheMac2/multiz5waySyn/maf
cat > loadMaf.csh << 'EOF'
    time hgLoadMaf -pathPrefix=/gbdb/rheMac2/multiz5waySyn/maf rheMac2 multiz5waySyn
    cat *.maf | \
        nice hgLoadMafSummary rheMac2 -minSize=30000 -mergeGap=1500 -maxSize=200000  multiz5waySummary stdin
'EOF'
#<< happy emacs

    # Dropped unused indexes (2006-05-09 kate)
    # NOTE: this is not required in the future, as the loader
    # has been fixed to not generate these indexes
    hgsql rheMac2 -e "alter table multiz5waySummary drop index chrom_2"
    hgsql rheMac2 -e "alter table multiz5waySummary drop index chrom_3"

    # expect lengthy load time for this -- a few hours ?
    csh loadMaf.csh >&! loadMaf.log &
#Loading multiz5waySyn into database
#Loaded 6207706 mafs in 21 files from /gbdb/rheMac2/multiz5waySyn/maf
#Advisory lock created
#Advisory lock has been released
#455.360u 156.210s 13:51.54 73.5%        0+0k 0+0io 237pf+0w
#Indexing and tabulating stdin
#Created 795772 summary blocks from 15566976 components and 6207706 mafs from stdin
#Loading into rheMac2 table multiz5waySummary...
#Loading completeAdvisory lock has been released

        #drop index to improve performance
    hgsql rheMac2 -e " alter table multiz5waySummary drop index chrom_3"

    # build chromosome-by-chromosome SS files

    cd /cluster/data/rheMac2/bed/multiz.rheMac2Hg18Mm8Rn4CanFam2
    mkdir run-ss-syn
    cd run-ss-syn
    mkdir -p /san/sanvol1/scratch/rheMac2/multiz.rheMac2Hg18Mm8Rn4CanFam2/ssSyn
    cat > makeSS.csh << 'EOF'
#!/bin/csh -fe
set c = $1
/cluster/bin/phast/msa_view -i MAF -o SS /cluster/data/rheMac2/bed/multiz.rheMac2Hg18Mm8Rn4CanFam2/mafSyn/$c.maf --refseq /cluster/bluearc/rheMac2/chrom/$c.fa | gzip -c >  /san/sanvol1/scratch/rheMac2/multiz.rheMac2Hg18Mm8Rn4CanFam2/ssSyn/$c.ss.gz
'EOF'
# << for emacs
    chmod +x makeSS.csh

    foreach chr (`cut -f 1 /cluster/data/rheMac2/chrom.sizes`)
	echo "makeSS.csh $chr" >> jobList.ss
    end

    para create jobList.ss
        # 20 jobs
    para try; para check
    para push

    # gzip mafs for downloads area
    ssh kkstore01
    cd cluster/data/hg17/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2/mafDownloads
cat > downloads.csh << 'EOF'
    date
    foreach f (../mafSyn/chr*.maf)
	set c = $f:t:r
        echo $c
	nice gzip -c $f > $c.maf.gz
    end
    md5sum *.gz > md5sum.txt
    date
'EOF'
    time csh downloads.csh >&! downloads.log
        # ~2 hours
    ssh hgwdev
    set dir = /usr/local/apache/htdocs/goldenPath/rheMac2/multiz5waySyn
    mkdir $dir
    ln -s /cluster/data/rheMac2/bed/multiz5waySyn/mafDownloads/{*.gz,md5sum.txt} $dir
    cp README.txt $dir

    #make multi5wayFrames table for browser track
    ssh kkstore01
    cd /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2/
    set sanDir = /san/sanvol1/scratch/rheMac2/multiz5waySyn/frames
    mkdir -p $sanDir/maf
    cp -rp maf/* $sanDir/maf
    mkdir frames
    cd frames
    cp /cluster/data/mm7/bed/multiz17wayFrames/mkMafFrames .
    cp /cluster/data/mm7/bed/multiz17wayFrames/Makefile .
    #edit Makefile to correct species names and set and sanDir

    ssh hgwdev
    cd /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2/frames
    make getGenes >&! getGenes.log &
        # ~1 minute
    make getFrames >&! getFrames.log &
    # NOTE: if jobs get hung up (e.g. running for hours, when
    #   they should run for minutes, do 'para stop' so that
    #   the 'para make' can restart the job
    make loadDb >&! loadDb.log &


#trackDb entry
#track multiz5waySyn
#shortLabel Conservation 
#longLabel Human/Mouse/Rat/Dog Multiz Alignment & Conservation using syntenic alignments
#group compGeno
#priority 104.1
#visibility pack
#color 0, 10, 100
#altColor 0,90,10
#type wigMaf 0.0 1.0
#maxHeightPixels 100:40:11
#wiggle phastCons5way
#pairwiseHeight 12
#spanList 1
#yLineOnOff Off
#autoScale Off
#summary multiz5waySummary
#frames multiz5wayFrames
#irows on
#speciesGroups mammal 
#sGroup_mammal hg18 mm8 rn4 canFam2 
#speciesDefaultOff 

    # request push to hgwdev

    ###
    # rebuild frames to get bug fix, using 1-pass maf methodology
    # (2006-06-09 markd)
    ssh kkstore01
    cd /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2/frames
    mv mafFrames/ mafFrames.old
    nice tcsh # easy way to get process niced
    (cat  ../mafSyn/*.maf | time genePredToMafFrames rheMac2 stdin stdout canFam2 genes/canFam2.gp.gz hg18 genes/hg18.gp.gz mm8 genes/mm8.gp.gz rheMac2 genes/rheMac2.gp.gz rn4 genes/rn4.gp.gz | gzip >multiz5wayFrames.mafFrames.gz)>&log
    ssh hgwdev
    cd /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2/frames
    hgLoadMafFrames rheMac2 multiz5wayFrames multiz5wayFrames.mafFrames.gz >&log
    

###############################################################
# PHASTCONS CONSERVATION (NOT DONE yet, 2006-05-06 Robert)
# coverage is only 3% and needs more tweaking
# These parameters are:
# --rho
# --expected-length
# --target-coverage
# Also, instead of generating cons and noncons tree models,
# we use a single, pre-existing tree model -- Elliot Margulies' model 
# from the (37-way) ENCODE alignments.

    # NOTE: reusing cluster-friendly chrom fasta files created earlier

    ssh kkstore01    
    mkdir /cluster/bluearc/rheMac2/chrom
    cd /cluster/data/rheMac2
    foreach f (`cat chrom.lst`)
    echo $f
    cp $f/*.fa /cluster/bluearc/rheMac2/chrom
    end

    # Split chromosome MAF's into windows and use to generate
    # "sufficient statistics" (ss) files for phastCons input
    # NOTE: as the SAN fs has lotsa space, we're leaving these
    # big (temp) files unzipped, to save time during phastCons run.
    # Note also the larger chunk sizes from previous runs -- this
    # reduces run-time on the split, slows down the actual phastCons
    # enough so jobs don't crash (jobs are very quick, just a minute
    # or so), and according to Adam, will produce better results.
    # The previous small chunks were probably required by
    # the phyloFit step, which we are no longer using for the
    # human alignments.
    ssh pk
    mkdir /cluster/data/rheMac2/bed/multiz5waySyn/cons
    cd /cluster/data/rheMac2/bed/multiz5waySyn/cons
    cp /cluster/store5/gs.18/build35/bed/multiz17way.2005-12-20/cons/elliotsEncode.mod .
    # edit, change to hg18,  mm8, and rn4.
    mkdir run.split
    cd run.split
    set WINDOWS = /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons/ss
    rm -fr $WINDOWS
    mkdir -p $WINDOWS

    cat << 'EOF' > doSplit.csh
#!/bin/csh -ef
    set MAFS = /cluster/data/rheMac2/bed/multiz5waySyn/mafSyn
    set WINDOWS = /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons/ss
    cd $WINDOWS
    set c = $1
    echo $c
    rm -fr $c
    mkdir $c
    /cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \
        -M /cluster/bluearc/rheMac2/chrom/$c.fa \
        -o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
    echo "Done" >> $c.done
'EOF'
# << happy emacs
    chmod +x doSplit.csh

rm -f jobList
foreach f (../../mafSyn/*.maf) 
set c = $f:t:r
echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList
end
    
    para create jobList
        # 49 jobs
    para try
    para check
    para push

   # estimate model parameters
    # estimate avg. cross-species avg. GC content from chr1 maf's

    ssh kolossus
    set path = ($path /cluster/bin/phast); rehash
    cd /cluster/data/rheMac2/bed/multiz5waySyn/cons
    msa_view --aggregate rheMac2,hg18,rn4,mm8,canFam2 \
        -i MAF \
        --summary-only /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2/mafSyn/chr1.maf \
           > maf_summary.txt
    awk '$1 == "[aggregate]" {printf "%0.3f\n", $3 + $4}' maf_summary.txt
    # 0.424
   
    # Be sure to substitute in the right G+C content. Also, notice the
    # target coverage of 0.17.  We actually want 5% coverage here but
    # the final (posterior) coverage is only indirectly related to the
    # expected (prior) coverage.  One thing to consider is that we
    # only have about 40% alignment coverage (excluding chimp, which
    # doesn't help us much in identifying conserved regions).  As far
    # as phastCons is concerned, we want to aim for about 0.05 / 0.4 =
    # 0.125 coverage.  In this case, though, --target-coverage
    # 0.125 resulted in only about 4.1% coverage.  I had to iterate
    # a couple of times (using only chromosome 1) to find a value that
    # got me close to the target of 5%

   ssh pk
   mkdir -p /cluster/data/rheMac2/bed/multiz5waySyn/cons/run.elements
   cd /cluster/data/rheMac2/bed/multiz5waySyn/cons/run.elements
   mkdir -p TREES
   mkdir -p LOG
   ls /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons/ss/chr*/*ss > all.windows
   for f in `cat all.windows` ; do         root=`basename $f .ss.gz` ;        echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobs.lst ;    done
   para create jobs.lst
   para push

    # check tree model on 5MB chunk, using params recommended by Adam,
    # he ok'ed the results -- not necessary for next human run
    ssh kolossus
    cd /cluster/data/rheMac2/bed/multiz5waySyn/cons
    /cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s HKY85 \
        --tree "`cat ../tree-commas.nh`" \
        /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons/ss/chr7/chr7.110000001-120000000.ss \
        -o phyloFit.tree

    # Run phastCons
    #	This job is I/O intensive in its output files, thus it is all
    #	working over in /scratch/tmp/
    # cd ..
    mkdir run.cons
    cd run.cons
    cat > doPhast.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set tmp = /scratch/tmp/$f
mkdir -p $tmp
set san = /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons
cp -p $san/ss/$c/$f.ss ../elliotsEncode.mod $tmp
pushd $tmp > /dev/null
/cluster/bin/phast/$MACHTYPE/phastCons $f.ss elliotsEncode.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative panTro1,rheMac2 \
--seqname $c --idpref $c --viterbi $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/pp/$c $san/bed/$c
sleep 1
mv $tmp/$f.pp $san/pp/$c
mv $tmp/$f.bed $san/bed/$c
rm -fr $tmp
'EOF'
    # emacs happy
    chmod a+x doPhast.csh

    #	root1 == chrom name, file1 == ss file name without .ss suffix
    # Create gsub file
    cat > template << 'EOF'
#LOOP
doPhast.csh $(root1) $(file1) 14 .008 .28
#ENDLOOP
'EOF'
    #	happy emacs

    # Create parasol batch and run it
    pushd /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons
    # mkdir /cluster/data/rheMac2/bed/multiz5waySyn/cons/run.cons
    ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \
        /cluster/data/rheMac2/bed/multiz5waySyn/cons/run.cons/in.list

    ssh pk
    cd /cluster/store11/gs.19/build36/bed/multiz5waySyn/cons/run.cons

    gensub2 in.list single template jobList
    para create jobList
        # 295 jobs
    para try
    para check
    para push
#Completed: 295 of 295 jobs
#CPU time in finished jobs:       9160s     152.67m     2.54h    0.11d  0.000 y
#IO & Wait Time:                  2969s      49.48m     0.82h    0.03d  0.000 y
#Average job time:                  41s       0.69m     0.01h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:              78s       1.30m     0.02h    0.00d
#Submission to last job:           276s       4.60m     0.08h    0.00d

    # create Most Conserved track
    ssh kolossus
    cd /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons
    #	The sed's and the sort get the file names in chrom,start order
    # (Hiram tricks -- split into columns on [.-/] with 
    #    identifying x,y,z, to allow column sorting and
    #    restoring the filename.  Warning: the sort column
    # will depend on how deep you are in the dir
    find ./bed -name "chr*.bed" | \
        sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
	sort -k7,7 -k9,9n | \
	sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
	xargs cat | \
	awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
	/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
    #	~ 1 minute
    cp -p mostConserved.bed /cluster/data/rheMac2/bed/multiz5waySyn/cons

    # load into database
    ssh hgwdev
    cd /cluster/data/rheMac2/bed/multiz5waySyn/cons
    hgLoadBed -strict rheMac2 phastConsElements5way mostConserved.bed
     #Loaded 602351 elements of size 5
    # compare with previous tracks
    hgsql rheMac2 -e "select count(*) from phastConsElements5way"
        # 2260575
    # hgsql hg18 -e "select count(*) from phastConsElements"
    # hg18 does not have phastConsElements table
        # 1601903
    # Try for 5% overall cov, and 70% CDS cov (used elen=13, tcov=.007, rho=.27)
    featureBits hg18 -enrichment refGene:cds phastConsElements17way
    # refGene:cds 1.072%, phastConsElements17way 5.510%, both 0.759%, cover 70.83%, enrich 12.86x
    featureBits hg17 -enrichment refGene:cds phastConsElements17way
    # refGene:cds 1.064%, phastConsElements17way 5.104%, both 0.748%, cover 70.29%, enrich 13.77x

    # compare with previous tracks
    featureBits hg18 -enrichment refGene:cds phastConsElements10way
        # refGene:cds 1.062%, phastConsElements10way 5.003%, both 0.734%, cover 69.18%, enrich 13.83x
    featureBits hg18 -enrichment refGene:cds phastConsElements
        # refGene:cds 1.062%, phastConsElements 4.810%, both 0.771%, cover 72.65%, enrich 15.11x

    # Create merged posterier probability file and wiggle track data files
    #	pk is currently closer to the san than any other machine
    ssh pk
    cd /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons/
    # sort by chromName, chromStart so that items are in numerical order 
    #  for wigEncode
    find ./pp -name "chr*.pp" | \
        sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
	sort -k7,7 -k9,9n | \
	sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
	xargs cat | \
        nice wigEncode stdin phastCons5way.wig phastCons5way.wib
    # about 23 minutes for above

    cp -p phastCons17way.wi? /cluster/data/hg18/bed/multiz17way/cons

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /cluster/data/hg18/bed/multiz17way/cons
    ln -s `pwd`/phastCons17way.wib /gbdb/hg18/multiz17way/phastCons17way.wib
    hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz17way hg18 \
        phastCons17way phastCons17way.wig
    #  ~ 3 minute load

    # Downloads  (2006-02-22 Fan)
    ssh hgwdev
    cd /cluster/data/hg18/bed/multiz17way
    mkdir mafDownloads
    cd mafDownloads
    # upstream mafs (mafFrags takes a while)
cat > mafFrags.csh << 'EOF'
    date
    foreach i (1000 2000 5000)
        echo "making upstream$i.maf"
        nice featureBits hg18 refGene:upstream:$i -fa=/dev/null -bed=up.bad
        awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
        rm up.bad
        nice mafFrags hg18 multiz17way up.bed upstream$i.maf \
                -orgs=../species.lst
        rm up.bed
    end
    date
'EOF'

    time csh mafFrags.csh > mafFrags.log 
    nice gzip up*.maf

    ssh kkstore02
    cd /cluster/data/hg18/bed/multiz17way/mafDownloads
cat > downloads.csh << 'EOF'
    date
    foreach f (../maf/chr*.maf)
	set c = $f:t:r
        echo $c
	nice gzip -c $f > $c.maf.gz
    end
    md5sum *.gz > md5sum.txt
    date
'EOF'
    time csh downloads.csh > downloads.log

    ssh hgwdev
    set dir = /usr/local/apache/htdocs/goldenPath/hg18/multiz17way
    mkdir $dir
    ln -s /cluster/data/hg18/bed/multiz17way/mafDownloads/{*.gz,md5sum.txt} $dir


#####################################################################
# SEGMENTAL DUPLICATIONS (DONE 6/30/06 angie)
    # File emailed from Xinwei She <xws@u.washington.edu>
    mkdir /cluster/data/rheMac2/bed/genomicSuperDups
    cd /cluster/data/rheMac2/bed/genomicSuperDups
    sed -e 's/\t_\t/\t-\t/' rheMac2_genomicSuperDup.tab \
    | awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \
    | hgLoadBed rheMac2 genomicSuperDups stdin\
      -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql


######
# ENSEMBL (done  6/20/06 robert)
wget http://www.sanger.ac.uk/Users/lec/macaque_data/gtf.tar.gz
wget http://www.sanger.ac.uk/Users/lec/macaque_data/macaque_ensembl_cdnas.fa.gz
wget -q http://www.sanger.ac.uk/Users/lec/macaque_data/macaque_ensembl_peptides.fa.gz 
gunzip *.gz
for i in `ls ensembl_macaque_gtf_data/` ; do echo ${i%%.gtf} ; awk '{print "chr"$0}' ensembl_macaque_gtf_data/$i > chr$i ; done
rm -f chrscaffold.gtf
cp ensembl_macaque_gtf_data/scaffold.gtf scaffold.gtf 
cat chr*.gtf > ensGene.gtf
grep -v pseudo ensAll.gtf > ensGene.gtf
ldHgGene rheMac2 ensGene ensGene.gtf -gtf -genePredExt
Reading ensGene.gtf
Read 38561 transcripts in 741623 lines in 1 files
  38561 groups 22 seqs 8 sources 4 feature types
38561 gene predictions
hgsql rheMac2 < ~/kent/src/hg/lib/ensGtp.sql
awk '{print $10,$12,$16}' ensGene.gtf |grep ENSMMUP|awk 'NF==3{print $0}'|sed -e 's/\"//g' | sed -e 's/\;//g'|sed -e 's/ /\t/g' |sort -u > ensGtp.tab
echo "load data local infile 'ensGtp.tab' into table ensGtp ignore 1 lines" | hgsql -N rheMac2
sed -e 's/_/ /' macaque_ensembl_peptides.fa  > ensPep.fa
hgPepPred rheMac2  ensembl ensPep.fa
grep pseudogene ensAll.gtf > ensPseudo.gtf
grep -v protein_coding ensAll.gtf |grep -v pseudo > ensRna.gtf

##########################################################################
# N-SCAN gene predictions (nscanGene) - (2006-08-04 markd)
    cd /cluster/data/rheMac2/bed/nscan/

    # obtained NSCAN predictions from michael brent's group
    # at WUSTL
    wget -nv -r -np http://ardor.wustl.edu/jeltje/rheMac2/chr_gtf
    wget -nv -r -np http://ardor.wustl.edu/jeltje/rheMac2/proteins
    # clean up and rename downloaded directorys:
    mv ardor.wustl.edu/jeltje/rheMac2/chr_gtf .
    mv ardor.wustl.edu/jeltje/rheMac2/proteins chr_ptx
    rm -rf ardor.wustl.edu
    rm chr_*/index.html*
    gzip chr_*/*
    chmod a-w chr_*/*.gz

    # load tracks.  Note that these have *utr features, rather than
    # exon features.  currently ldHgGene creates separate genePred exons
    # for these.
    ldHgGene -bin -gtf -genePredExt rheMac2 nscanGene chr_gtf/chr*.gtf.gz
    # add .a suffix to match transcript id

    # add .a suffix to match transcript id
    hgPepPred -suffix=.a rheMac2 generic nscanPep chr_ptx/chr*.fa.gz
    rm *.tab

    # update trackDb; need a rheMac2-specific page to describe informants
    rhesus/rheMac2/nscanGene.html   (copy from panTro2 and edit)
    rhesus/rheMac2/trackDb.ra

    # QA found:
     Checking keys on database rheMac2
     rheMac2.nscanPep.name - hits 22192 of 24608
     Error: 2416 of 24608 elements of rheMac2.nscanPep.name are not in key nscanGene.name line 524 of all.joiner
     Example miss: CHR1.5.052.A
     # due to WUStL giving us a mismatched protein file.

    # 2006-08-11 markd - obtained new protein sequences
    wget -nv -r -np http://ardor.wustl.edu/jeltje/rheMac2/proteins
    hgPepPred -suffix=.a rheMac2 generic nscanPep proteins/chr*.fa.gz

    joinerCheck -keys -database=rheMac2 -identifier=nscanName  ~/compbio/kent/src/hg/makeDb/schema/all.joiner
    Checking keys on database rheMac2
    rheMac2.nscanPep.name - hits 22003 of 22003 ok

    # 2006-08-11 markd - N-SCAN track has UTR only predictions, reload discarding
    # these
    ldHgGene -bin -gtf -requireCDS -genePredExt rheMac2 nscanGene chr_gtf/chr*.gtf.gz
# CPGISLANDS (DONE - 2006-09-12 - Robert)
    ssh hgwdev
    mkdir -p /cluster/data/rheMac2/bed/cpgIsland
    cd /cluster/data/rheMac2/bed/cpgIsland

    # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
    cvs co hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    make
    #	gcc readseq.c cpg_lh.c -o cpglh.exe
    mv cpglh.exe /cluster/data/rheMac2/bed/cpgIsland/
    
    # cpglh.exe requires hard-masked (N) .fa's.  
    # There may be warnings about "bad character" for IUPAC ambiguous 
    # characters like R, S, etc.  Ignore the warnings.  
    ssh kkstore02
    cd /cluster/data/rheMac2/bed/cpgIsland
    foreach f (../../*/chr*.fa.masked)
      set fout=$f:t:r:r.cpg
      echo running cpglh on $f to $fout
      ./cpglh.exe $f > $fout
    end
    # Transform cpglh output to bed +
    cat << '_EOF_' > filter.awk
/* Input columns: */
/* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */
/* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */
/* Output columns: */
/* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */
/* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\to0.94 */
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
    # << this line makes emacs coloring happy
    awk -f filter.awk chr*.cpg > cpgIsland.bed

    ssh hgwdev
    cd /cluster/data/rheMac2/bed/cpgIsland
    hgLoadBed rheMac2 cpgIslandExt -tab -noBin \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
#Loaded 24226 elements of size 10
#Sorted
#Saving bed.tab
#Loading rheMac2

##########################################################################
# Hg 18 Reciprocal Best Nets  (8-13-07, kpollard)

ssh kolossus
cd /cluster/data/hg18/bed/blastz.rheMac2/axtChain
mkdir rBestNet
cp /cluster/data/rheMac2/chrom.sizes /cluster/bluearc/rheMac2/.
cat > rbest.csh << 'EOF'
    set swapChain = rheMac2ToHg18.swap.chain
    echo "merge and swap $swapChain"
    zcat /cluster/data/rheMac2/bed/liftOver/rheMac2ToHg18.over.chain.gz \ 
         | chainSwap stdin stdout | chainMergeSort stdin > $swapChain
    echo "split chain"
    chainSplit swapChain/ $swapChain
    echo "run netter"
    chainPreNet $swapChain /cluster/bluearc/hg18/chrom.sizes \
         /cluster/bluearc/rheMac2/chrom.sizes stdout | \
         chainNet stdin -minSpace=1 /cluster/bluearc/hg18/chrom.sizes \
         /cluster/bluearc/rheMac2/chrom.sizes stdout /dev/null | \
         netSyntenic stdin rbest.noClass.net
    echo "split net"
    netSplit rbest.noClass.net rBestNet
    echo "extract chains from net"
    netChainSubset -verbose=0 rbest.noClass.net $swapChain stdout | \ 
         chainMergeSort stdin | gzip -c > hg18.rheMac2.rbest.chain.gz
    echo "split extracted chain"
    chainSplit rBestChain/ hg18.rheMac2.rbest.chain.gz 
    cd ..
    mkdir axtRBestNet
    echo "extract axts"
    foreach f (axtChain/rBestNet/*.net)
        echo $f:t:r
        netToAxt $f axtChain/swapChain/$f:t:r.chain \
            /san/sanvol1/scratch/hg18/hg18.2bit \
            /cluster/bluearc/rheMac2/rheMac2.2bit stdout | \
            axtSort stdin stdout | gzip -c \
            > axtRBestNet/$f:t:r.hg18.rheMac2.net.axt.gz
    end
'EOF'
csh rbest.csh >&! rbest.log &
#Not yet loaded

############################################################################
##  BLASTZ swap from monDom4 alignments (2007-11-11 - markd)
    ssh hgwdev
    mkdir /cluster/data/rheMac2/bed/blastz.monDom4.swap
    cd /cluster/data/rheMac2/bed/blastz.monDom4.swap
    ln -s blastz.monDom4.swap ../blastz.monDom4
    /cluster/bin/scripts/doBlastzChainNet.pl \
        -swap /cluster/data/monDom4/bed/blastz.rheMac2/DEF >& swap.out&

    # fb.rheMac2.chainMonDom4Link.txt:
    # 611785347 bases of 2646704109 (23.115%) in intersection

#########################################################################
# Blastz Marmoset calJac1 (DONE - 2007-11-15 - Hiram)
    ssh kkstore01
    screen # use screen to control this job
    mkdir /cluster/data/rheMac2/bed/blastzCalJac1.2007-11-16
    cd /cluster/data/rheMac2/bed/blastzCalJac1.2007-11-16

    cat << '_EOF_' > DEF
# Rhesus vs marmoset
BLASTZ_M=50

# TARGET: Rhesus RheMac2
SEQ1_DIR=/scratch/hg/rheMac2/rheMac2.2bit
SEQ1_LEN=/cluster/data/rheMac2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Marmoset calJac1
SEQ2_DIR=/cluster/bluearc/scratch/data/calJac1/calJac1.2bit
SEQ2_LEN=/cluster/data/calJac1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/rheMac2/bed/blastzCalJac1.2007-11-16
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
	-bigClusterHub=kk > blastz.log 2>&1 &
    #	real    2115m6.616s
    # failed during the load due to MySQL problems on hgwdev
    #	run loadUp.csh manually on hgwdev
    #	real    28m35.926s
    cat fb.rheMac2.chainCalJac1Link.txt
    #	2055107003 bases of 2646704109 (77.648%) in intersection
    time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-continue=download -chainMinScore=3000 -chainLinearGap=medium \
	-bigClusterHub=kk > download.log 2>&1 &

    mkdir /cluster/data/calJac1/bed/blastz.rheMac2.swap
    cd /cluster/data/calJac1/bed/blastz.rheMac2.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/rheMac2/bed/blastzCalJac1.2007-11-16/DEF \
	-swap -chainMinScore=3000 -chainLinearGap=medium \
	-bigClusterHub=kk > swap.log 2>&1 &
    #	real    349m36.073s
    cat fb.calJac1.chainRheMac2Link.txt
    #	2191300051 bases of 2929139385 (74.810%) in intersection

###########################################################################
# Blastz Orangutan ponAbe2 (DONE - 2007-11-16 - Hiram)
    ssh kkstore01
    screen # use screen to control this job
    mkdir /cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16
    cd /cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16

    cat << '_EOF_' > DEF
# Rhesus vs marmoset
BLASTZ_M=50

# TARGET: Rhesus RheMac2
SEQ1_DIR=/scratch/hg/rheMac2/rheMac2.2bit
SEQ1_LEN=/cluster/data/rheMac2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Orangutan ponAbe2
SEQ2_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit
SEQ2_LEN=/cluster/data/ponAbe2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
	-bigClusterHub=kk > blastz.log 2>&1 &
    # something got lost during the para make.  A couple of jobs couldn't
    #	finish, due to memory limits.  Finish them off on hgwdev
    #	real    1911m11.906s
# Completed: 108924 of 108928 jobs
# Crashed: 4 jobs
# CPU time in finished jobs:   20658708s  344311.80m  5738.53h  239.11d  0.655 y
# IO & Wait Time:               1801215s   30020.25m   500.34h   20.85d  0.057 y
# Average job time:                 206s       3.44m     0.06h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            5401s      90.02m     1.50h    0.06d
# Submission to last job:        111924s    1865.40m    31.09h    1.30d
    #	continuing after those were done:
    time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-continue=cat -chainMinScore=3000 -chainLinearGap=medium \
	-bigClusterHub=kk > cat.log 2>&1 &

    # some jobs failed due to memory limits on kk nodes
    #	finished them manually on kolossus and hgwdev, then continuing
    time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
	-continue=cat -bigClusterHub=kk > cat.log 2>&1 &
    #	real    322m54.734s
    #	failed during load due to MySQL problems on hgwdev
    #	run loadUp.csh manually on hgwdev
    #	real    22m22.225s
    cat fb.rheMac2.chainPonAbe2Link.txt
    #	2333264093 bases of 2646704109 (88.157%) in intersection
    time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
	-continue=download -bigClusterHub=kk > download.log 2>&1 &

    #	and, for the swap
    mkdir /cluster/data/ponAbe2/bed/blastz.rheMac2.swap
    cd /cluster/data/ponAbe2/bed/blastz.rheMac2.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16/DEF \
	-swap -chainMinScore=3000 -chainLinearGap=medium \
	-bigClusterHub=kk > swap.log 2>&1 &
    #	real    289m9.479s
    cat fb.ponAbe2.chainRheMac2Link.txt
    #	2556010573 bases of 3093572278 (82.623%) in intersection

############################################################################
# SGP GENES (DONE - 2007-12-20 - Hiram)
    ssh kkstore01
    mkdir  /cluster/data/rheMac2/bed/sgp
    cd  /cluster/data/rheMac2/bed/sgp

    mkdir gtf
for C in `awk '{print $1}' /cluster/data/rheMac2/chrom.sizes`
do
    wget --timestamping \
"http://genome.imim.es/genepredictions/M.mulatta/rmJan2006/SGP/200603mm8/${C}.gtf" \
        -O "gtf/${C}.gtf"
done

    ssh hgwdev
    cd /cluster/data/rheMac2/bed/sgp
    ldHgGene -gtf -genePredExt rheMac2 sgpGene gtf/chr*.gtf
    # Read 31416 transcripts in 264479 lines in 22 files
    # 31416 groups 22 seqs 1 sources 3 feature types
    # 31416 gene predictions

    featureBits rheMac2 sgpGene
    #	32556064 bases of 2646704109 (1.230%) in intersection

    featureBits rheMac2 -enrichment refGene:CDS sgpGene
# refGene:CDS 0.017%, sgpGene 1.230%, both 0.014%, cover 83.53%, enrich 67.91x

#####################################################################
# LOAD GENEID GENES (DONE - 2007-12-20 - Hiram)
    ssh kkstore01
    mkdir -p /cluster/data/rheMac2/bed/geneid
    cd /cluster/data/rheMac2/bed/geneid
    mkdir gtf prot

for C in `awk '{print $1}' /cluster/data/rheMac2/chrom.sizes`
do
  echo $C
  wget --timestamping \
"http://genome.imim.es/genepredictions/M.mulatta/rmJan2006/geneid_v1.2/${C}.gtf" \
	-O "gtf/${C}.gtf"
  wget --timestamping \
"http://genome.imim.es/genepredictions/M.mulatta/rmJan2006/geneid_v1.2/${C}.prot" \
	-O "prot/${C}.prot"
done

    # Add missing .1 to protein id's
    cd prot
    foreach f (*.prot)
      perl -wpe 's/^(>chr\S+)$/$1.1/' $f > $f:r-fixed.prot
    end

    ssh hgwdev
    cd /cluster/data/rheMac2/bed/geneid
    ldHgGene -genePredExt -gtf rheMac2 geneid gtf/*.gtf
# Read 30202 transcripts in 248094 lines in 22 files
# 30202 groups 22 seqs 1 sources 3 feature types
# 30202 gene predictions

    hgPepPred rheMac2 generic geneidPep prot/chr*-fixed.prot
    featureBits rheMac2 geneid
    #	33978840 bases of 2646704109 (1.284%) in intersection

    featureBits rheMac2 -enrichment refGene:CDS geneid
# refGene:CDS 0.017%, geneid 1.284%, both 0.013%, cover 77.14%, enrich 60.09x

##########################################################################
############################################################################
# TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20

see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build  (2008-06-30 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30

see doc/builds.txt for specific details.
############################################################################

################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
rheMac2.upstreamGeneTbl = refGene
############################################################################
# TRANSMAP vertebrate.2009-07-01 build  (2009-07-21 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01

see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

see doc/builds.txt for specific details.

############################################################################
# calJac3 Marmoset BLASTZ/CHAIN/NET (DONE - 2010-02-11 - Hiram)
    screen # use a screen to manage this multi-day job
    mkdir /hive/data/genomes/rheMac2/bed/lastzCalJac3.2010-02-11
    cd /hive/data/genomes/rheMac2/bed/lastzCalJac3.2010-02-11

    #	same kind of parameters as used in human<->marmoset
    cat << '_EOF_' > DEF
# Rhesus vs. marmoset
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Rhesus RheMac2
SEQ1_DIR=/scratch/data/rheMac2/rheMac2.2bit
SEQ1_LEN=/scratch/data/rheMac2/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Marmoset (calJac3)
SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit
SEQ2_LEN=/scratch/data/calJac3/chrom.sizes
SEQ2_LIMIT=50
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/rheMac2/bed/lastzCalJac3.2010-02-11
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-syntenicNet \
	-chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real    248m5.651s
    cat fb.rheMac2.chainCalJac3Link.txt 
    #	1871513554 bases of 2646704109 (70.711%) in intersection

    mkdir /hive/data/genomes/calJac3/bed/blastz.rheMac2.swap
    cd /hive/data/genomes/calJac3/bed/blastz.rheMac2.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/rheMac2/bed/lastzCalJac3.2010-02-11/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    142m34.894s
    cat fb.calJac3.chainHg19Link.txt 
    #	1916431926 bases of 2752505800 (69.625%) in intersection

############################################################################
# refSeqAnno - RefSeq genomic annotations at the request of
# Kim Pruit <pruitt@ncbi.nlm.nih.gov> for use by the Rhesus community.
# (2010-10-14 markd)

    checkout markds python's tools in some directory:
       svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/code/pycbio/trunk
    and run make at top

    mkdir /hive/data/genomes/rheMac2/bed/refSeqAnno
    cd /hive/data/genomes/rheMac2/bed/refSeqAnno

    # get NCBI genome annotation:
    wget ftp://ftp.ncbi.nih.gov/genomes/Macaca_mulatta/README_CURRENT_BUILD
    wget -nv -r -A 'mmu_ref_*.gbs.gz' 'ftp://ftp.ncbi.nih.gov/genomes/Macaca_mulatta/CHR*'
    mkdir gbs
    mv ftp.ncbi.nih.gov/genomes/Macaca_mulatta/*/*.gbs.gz gbs/
    rm -rf ftp.ncbi.nih.gov

    # get mRNAs to create mRNA <-> protein mappings
    wget ftp://ftp.ncbi.nih.gov/genomes/Macaca_mulatta/RNA/rna.gbk.gz
    zcat rna.gbk.gz |awk '/^VERSION/{mrna=$2} /\/protein_id=/{print mrna"\t"gensub("^.*\"(.*)\"","\\1","g",$1)}'  >mrna2prot.tbl

    zcat gbs/*.gz |~/compbio/code/pycbio/bin/gbffGenesToGenePred --mrnaProtMap=mrna2prot.tbl --refSeqOnly --ext --allGenes --other /dev/stdin refSeqAnno.contig.gp >&cnv.probs

    # no lift file was made, so must build one
    ftp://ftp.ncbi.nih.gov/genomes/Macaca_mulatta/seq_contig.md.gz
    mdToNcbiLift seq_contig.md.gz ncbi.lft
    # need to fix various things in lift:
    #    - drop /start and /end entries
    #    - we don't have chromsome chrUn
    #    - mappings to weird chome names like: chrX|NW_001219121.1
    tawk '$2~/\/start/ || $2~/\/end/ || $4 ~/chrUn/{next} {$4=gensub("[|].*", "", "g",$4); print $0}' ncbi.lft >ncbi.fix.lft 

    # lift to chromosomes
    liftUp -type=.genepred -extGenePred refSeqAnno.gp ncbi.fix.lft warn refSeqAnno.contig.gp >&lift.probs
    # 635 drop due to not having chrUn

    genePredCheck -db=rheMac2 refSeqAnno.gp 
    hgLoadGenePred -genePredExt rheMac2 refSeqAnno refSeqAnno.gp 

    # add to 
      trackDb/rhesus/trackDb.ra
      trackDb/rhesus/rheMac2/trackDb.ra 
    # create
      trackDb/rhesus/refSeqAnno.html

############################################################################
# LASTZ Gorilla GorGor3 (DONE - 2011-10-21,23 - Hiram)
    mkdir /hive/data/genomes/rheMac2/bed/lastzGorGor3.2011-10-21
    cd /hive/data/genomes/rheMac2/bed/lastzGorGor3.2011-10-21

    cat << '_EOF_' > DEF
# Gorilla vs Rhesus
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
BLASTZ_O=600
BLASTZ_E=150
# other parameters on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Gorilla rheMac2
SEQ1_DIR=/scratch/data/rheMac2/rheMac2.2bit
SEQ1_LEN=/scratch/data/rheMac2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=30

# QUERY: Gibbon gorGor3
SEQ2_DIR=/scratch/data/gorGor3/gorGor3.2bit
SEQ2_LEN=/scratch/data/gorGor3/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/hive/data/genomes/rheMac2/bed/lastzGorGor3.2011-10-21
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-syntenicNet \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	1729m35s
    cat fb.rheMac2.chainGorGor3Link.txt  
    #	2175264883 bases of 2646704109 (82.188%) in intersection

    cd /hive/data/genomes/rheMac2/bed
    ln -s lastzGorGor3.2011-10-21 lastz.gorGor3

    #	running the swap - DONE - 2011-09-21
    mkdir /hive/data/genomes/gorGor3/bed/blastz.rheMac2.swap
    cd /hive/data/genomes/gorGor3/bed/blastz.rheMac2.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/rheMac2/bed/lastzGorGor3.2011-10-21/DEF \
	-swap -syntenicNet \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> swap.log 2>&1 &
    #	real    85m16.618s
    cat fb.gorGor3.chainRheMac2Link.txt
    #	2236867094 bases of 2822760080 (79.244%) in intersection

############################################################################
