#!/bin/csh -f
exit

# This is the make doc for the hg17 ENCODE data.
# NOTE: many of these tracks were lifted from hg16 with
# semi-automated processing. The liftOver leftovers were moved
# to the subdirectories "mapped" and "unmapped" of the main
# work area, /cluster/data/encode/convertHg17

    # create work area
    mkdir /cluster/data/encode/convertHg17
    cd  /cluster/data/encode/convertHg17
    ln -s /cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain hg16ToHg17.chain

    # Inventory ENCODE tables on hg16 (hgwbeta)

    ssh hgwbeta "echo select tableName from trackDb where tableName like \'encode%\' and settings not like \'%composite%\' order by tableName | hgsql hg16" > tables.txt
    wc -l tables.txt
        #     350 tables.txt

    set encodeBin = /cluster/data/encode/bin/scripts
    csh $encodeBin/listEncodeTables.csh hg16 > tableTypes.txt
    grep bed tableTypes.txt > tables.bed.txt

##########################################################################
# DOWNLOADS

    # Set up the hg17 ENCODE download area on hgwdev, seeding the README
    # files from the hg16 download area, then dump the ENCODE region
    # coordinates for the sequence extraction step that follows.
    ssh hgwdev
    # NOTE(review): path includes goldenPath/ to agree with every later
    # reference to this download area in this doc -- confirm.
    cd /usr/local/apache/htdocs/goldenPath/hg17
    mkdir -p encode
    cd encode
    # release terms
    cp ../../hg16/encode/README.txt .
    # annotation database
    # request admin set up automated database dump
    mkdir database
    # auxiliary data files
    mkdir datafiles
    # sequences
    mkdir regions
    cp ../../hg16/encode/regions/README.txt regions
    # edit README
    cd /cluster/data/encode/convertHg17
    # one line per ENCODE region: name, chrom, chromStart, chromEnd
    hgsql hg17 -N -e \
      "SELECT name, chrom, chromStart, chromEnd FROM encodeRegions ORDER BY name">regions.txt

    ssh kolossus
    cd /cluster/data/encode/convertHg17
    mkdir regions
    cd regions
    /cluster/data/encode/bin/scripts/encodeSequences.pl -upper \
        ../regions.txt /iscratch/i/hg17/nib  > hg17.fa
    /cluster/data/encode/bin/scripts/encodeSequences.pl -masked \
        ../regions.txt /iscratch/i/hg17/nib  > hg17.msk.fa
    faSize detailed=on hg17.fa > hg17_count.txt
    gzip *.fa
    md5sum *.fa.gz > md5sum.txt
    # copy regions/README.txt from hg16 and edit

    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/encode
    ln -s /cluster/data/encode/convertHg17/regions .

    # October MSA freeze
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/encode
    mkdir alignments
    ln -s /cluster/data/encode/downloads/msa/SEP-2005 .
    # terms of use
    cp /usr/local/apache/htdocs/goldenPath/hg16/encode/alignments/README.txt .

###########################################################################
###########################################################################
# Tracks lifted from hg16

##########################################################################
# GIS PET  (2005-08-23 kate)
# New genome-wide data (cMyc) submitted by Chialin (2006-10-25)

    cd  /cluster/data/encode/convertHg17

    # use mysqldump to generate .sql w/ schema, and .txt with data
    set t = encodeGisRnaPetHCT116
    $encodeBin/dumpTable.csh hg16 $t
    wc -l $t.txt
        # 112782 encodeGisRnaPetHCT116.txt

    # create table
    hgsql hg17 < $t.sql

    # convert data coordinates
    ~/bin/i386/liftOver $t.txt -hasBin -bedPlus=12 \
            hg16ToHg17.chain $t.tab $t.unmapped
    wc -l $t.tab $t.unmapped
         # 112701 encodeGisRnaPetHCT116.tab
         #    162 encodeGisRnaPetHCT116.unmapped

    # load into database
    echo "LOAD DATA local INFILE '$t.tab' INTO TABLE $t" | hgsql hg17
    hgsql hg17 -N -s -e "SELECT COUNT(*) FROM $t"
        # 112701
    checkTableCoords hg17 $t

    # Now try scripted version
    csh $encodeBin/convertBedTable.csh hg16 hg17 encodeGisRnaPetMCF7 12
        # encodeGisRnaPetMCF7     hg16 104304   hg17 104187
    csh $encodeBin/convertBedTable.csh hg16 hg17 encodeGisChipPet 12
        # encodeGisChipPet        hg16 65513   hg17 65510

    # 2006-10-25 cMyc data
    cd /cluster/data/encode/GIS
    mkdir -p cMyc/2006-10-25/lab
    # copy files from ftp dir
    cd cMyc/2006-10-25
    # use Angie's methods from hg16 to generate score from cluster count
    # The name field (column 4) must look like <digits>-<count>; the count
    # sets the score column: >= 4 -> 1000, 3 -> 800, otherwise 333.
    # Any line whose name does not match aborts the run ("parse").
    grep '^chr' lab/GIS_c-Myc_P493.bed | \
        perl -wpe 'chomp; @w = split; \
                 if ($w[3] =~ /^\d+-(\d+)$/) { \
                   $w[4] = ($1 >= 4 ? 1000 : ($1 >= 3 ? 800 : 333)); \
                 } else { die "parse"; } \
                 $_ = join("\t", @w) . "\n";' > myc.bed
    hgLoadBed -strict hg17 encodeGisChipPetMycP493 myc.bed
        # Loaded 276788 elements of size 12
    # database argument is required, as in the other checkTableCoords calls
    checkTableCoords hg17 encodeGisChipPetMycP493

    # Create a composite track and merge in P53 and STAT1 data
    # as subtracks

##########################################################################
# KNOWN+PRED RNA (2005-08-29 kate)

    cd  /cluster/data/encode/convertHg17
    grep encodeRna tables.bed.txt
        # encodeRna       encodeGenes     bed 6 +

    $encodeBin/convertBedTable.csh hg16 hg17 encodeRna 6

##########################################################################
# TBA23 Evofold (2005-08-23 kate)

    cd  /cluster/data/encode/convertHg17
    csh $encodeBin/convertBedTable.csh hg16 hg17 encode_tba23EvoFold 6
            # 739 encode_tba23EvoFold.txt
        # Reading liftover chains
        # Mapping coordinates
            # 739 encode_tba23EvoFold.tab
              # 0 encode_tba23EvoFold.unmapped
            # 739 total
        # encode_tba23EvoFold     hg16 739   hg17 739



##########################################################################
# Transcription Levels Group
# BU FIRST EXON
    grep encodeBu tables.bed.txt
        # encodeBuFirstExonCerebrum       encodeTxLevels  bed 12 +
        # encodeBuFirstExonColon  encodeTxLevels  bed 12 +
        # encodeBuFirstExonHeart  encodeTxLevels  bed 12 +
        # encodeBuFirstExonKidney encodeTxLevels  bed 12 +
        # encodeBuFirstExonLiver  encodeTxLevels  bed 12 +
        # encodeBuFirstExonLung   encodeTxLevels  bed 12 +
        # encodeBuFirstExonSkMuscle       encodeTxLevels  bed 12 +
        # encodeBuFirstExonSpleen encodeTxLevels  bed 12 +
        # encodeBuFirstExonStomach        encodeTxLevels  bed 12 +
        # encodeBuFirstExonTestis encodeTxLevels  bed 12 +
    
    set buTables = `echo "SHOW TABLES LIKE 'encodeBuFirstExon%'" | hgsql -N -s hg16`
    foreach t ($buTables)
        csh $encodeBin/convertBedTable.csh hg16 hg17 $t 12
        checkTableCoords hg17 $t
    end


# RIKEN CAGE
    grep encodeRikenCage tables.bed.txt
        # encodeRikenCageMinus    encodeTxLevels  bedGraph 4
        # encodeRikenCagePlus     encodeTxLevels  bedGraph 4

    csh $encodeBin/convertBedTable.csh hg16 hg17 encodeRikenCageMinus 4
        # Creating hg16 encodeRikenCageMinus.sql and encodeRikenCageMinus.txt
           # 6156 encodeRikenCageMinus.txt
        # Reading liftover chains
        # Mapping coordinates
           # 6153 encodeRikenCageMinus.tab
              # 6 encodeRikenCageMinus.unmapped
           # 6159 total
        # encodeRikenCageMinus    hg16 6156   hg17 6153

    csh $encodeBin/convertBedTable.csh hg16 hg17 encodeRikenCagePlus 4
            # csh $encodeBin/convertBedTable.csh hg16 hg17 encodeRikenCagePlus 4
        # Creating hg16 encodeRikenCagePlus.sql and encodeRikenCagePlus.txt
           # 5688 encodeRikenCagePlus.txt
        # Reading liftover chains
        # Mapping coordinates
           # 5639 encodeRikenCagePlus.tab
             # 98 encodeRikenCagePlus.unmapped
           # 5737 total
        # encodeRikenCagePlus     hg16 5688   hg17 5639


##########################################################################
# CHIP/CHIP GROUP
# 
# STANFORD CHIP
# encodeStanfordChip* bedGraph 4 tracks

cat > doStan.csh << 'EOF'
    set stanTables = \
        `echo "SHOW TABLES LIKE 'encodeStanfordChip%'" | hgsql -N -s hg16`
    foreach t ($stanTables)
        csh /cluster/data/encode/bin/scripts/convertBedTable.csh \
                hg16 hg17 $t 4
    end
'EOF'
    csh doStan.csh >&! doStan.log
    grep hg17 doStan.log | wc -l
        # 12 tracks (6 smoothed)
        # encodeStanfordChipHCT116Sp1     hg16 369633   hg17 369465
        # encodeStanfordChipSmoothedHCT116Sp1     hg16 137439   hg17 137361

# UCD Ng
        csh $encodeBin/convertBedTable.csh hg16 hg17 encodeUCDavisE2F1Median 4
        # encodeUCDavisE2F1Median hg16 382884   hg17 382713

# UCSD/LI CHIP
# encodeUcsdChip* bedGraph 4 tracks (total 36)

cat > doUcsd.csh << 'EOF'
    set ucsdTables = \
        `echo "SHOW TABLES LIKE 'encodeUcsdChip%'" | hgsql -N -s hg16`
    foreach t ($ucsdTables)
        csh /cluster/data/encode/bin/scripts/convertBedTable.csh \
                hg16 hg17 $t 4
    end
'EOF'
    csh doUcsd.csh >&! doUcsd.log
    grep hg17 doUcsd.log | wc -l
    # 36 tracks
    # encodeUcsdChipAch3Imr90 hg16 24348   hg17 24339
    # encodeUcsdChipHeLaH3H4tmH3K4_p30        hg16 24537   hg17 24528


##########################################################################
# TRANSCRIPTION LEVELS TRACKS (2005-08-24 kate)

    # grep encodeTxLevels in tables.bed.txt and edit out already
    # completed tracks.  Prefix each table with a call to convertBedTable
    # and suffix with bed field count
    # Tracks are: Stanford RTPCR, Yale TARS

    csh doTx.csh >&! doTx.log
    grep hg17 doTx.log | wc -l
        # 9 tracks

##########################################################################
# CHROMATIN & CHROMOSOMES TRACKS (2005-08-24 kate)

    # Regulome, NHGRI DNase, Stanford Meth, UVA
    csh doChrom.csh >&! doChrom.log
    # 37 tables
    # do Stanford Meth Smoothed tables that weren't converted because
        # hg16 tables had incorrect capitalization wrt trackDb
        # and so weren't being displayed
    csh doChrom2.csh >&! doChrom2.log

##########################################################################
# CHIP/CHIP TRACKS (2005-08-24 kate)
# Sanger, UCSD Nimblegen

    # run via csh like the other do*.csh conversion scripts, so it does not
    # depend on the script being executable or on "." being in the path
    csh doChip.csh >&! doChip.log

##########################################################################
# VARIATION TRACKS (2005-08-24 kate)
#  HapMap, Reseq, Sanger Gene Expr

    csh doVar.csh >&! doVar.log
    grep hg17 doVar.log
        # encodeReseqRegions      hg16 10   hg17 10
        # encodeSangerGenoExprAssociation hg16 13674   hg17 13674
    csh doHap.csh >&! doHap.log
    grep hg17 doHap.log
        # encodeHapMapAlleleFreqCEU       hg16 20772   hg17 20772
        # encodeHapMapAlleleFreqCHB       hg16 19629   hg17 19629
        # encodeHapMapAlleleFreqJPT       hg16 19629   hg17 19629
        # encodeHapMapAlleleFreqYRI       hg16 19520   hg17 19520
    csh /cluster/data/encode/bin/scripts/convertBedTable.csh \
                hg16 hg17 encodeRecomb         4


##########################################################################
# AFFY CHIP/CHIP TRACKS (2005-08-24 kate)
    csh doAffy.csh >&! doAffy.log
        # 41 doAffy.csh
    grep hg17 doAffy.log | wc -l
        # 41 

    # do tracks missing from RR!
    csh doAffy2.csh >&! doAffy2.log
    wc -l doAffy2.csh
        # 6 doAffy2.csh
    grep hg17 doAffy2.log | wc -l
        # 6

##########################################################################
# WIG TRACKS (2005-08-24 kate)
        # scripts are run via csh, like the other do*.csh conversion scripts
        csh doWig.csh > doWig.log
        # 75 tables

##########################################################################
# YALE TRACKS (2005-08-31 kate)
        csh doYale.csh > doYale.log
        wc -l doYale.csh
            # 54 doYale.csh
        grep hg17 doYale.log | wc -l
            # 50 
            # redo the 4 that failed
        csh doYale2.csh > doYale2.log
        grep hg17 doYale2.log | wc -l
            # 4 tracks

##########################################################################
##########################################################################
# Tracks submitted in hg17 coords

##########################################################################
# GENCODE Sanger Havana annotations  (2005-08-18 kate)
    # Used latest (6/7/05) data submission, which was submitted
    #   in hg17 coords and lifted to hg16.  This was described in makeEncodeHg16.doc

    ssh hgwdev
    cd /cluster/data/encode/Gencode
    cd 2005-06-07

    ldHgGene -gtf -genePredExt hg17 encodeGencodeGene gencode.vega.gtf
        # 2888 gene predictions
    checkTableCoords hg17 encodeGencodeGene

    grep intron gencode.gtf | wc -l
        # 15814
    grep -v not_tested gencode.gtf | sed -e 's/-intron/-/g' | \
        ldGencodeIntron hg17 encodeGencodeIntron stdin
            # 469 introns

    # load gene class table 
    hgsql hg17 < ~/kent/src/hg/lib/gencodeGeneClass.sql
    echo "LOAD DATA LOCAL INFILE 'gencodeGeneClass.tab' into table gencodeGeneClass" | hgsql hg17
    wc -l gencodeGeneClass.tab
        #    2888 gencodeGeneClass.tab


##########################################################################
# EGASP Partial (2005-08-18 kate)
# Gene tracks submitted for the EGASP competition were hg17-based
#       by the Gencode group (Roderic Guigo, Julien Legarde, IMIM) 
# These were lifted to hg16, as described in makeEncodeHg16.doc
# NOTE: Problem with encodeEgaspPartAugustusAny table detected
# and fixed on 2006-01-09.  It was somehow loaded with Genemark full data...
    cd /cluster/data/encode
    cd EGASP/Partial
    wc -l lab/*.gtf
       # 1778 lab/ASPic.gtf
       # 4215 lab/AceSCAN.gtf
       # 2692 lab/Augustus_EST-Protein.gtf
       # 2347 lab/Augustus_abinitio.gtf
       # 2736 lab/Augustus_any.gtf
       # 2567 lab/Augustus_dualgenome.gtf
       # 3458 lab/GeneZilla.gtf
       # 2194 lab/SAGA.gtf
    # NOTE: exclude ASPic, which contains only intron records
    # Filenames above, with _CHR_COORDS_hg17.gff appended, are chrom coordinate versions

    # GeneZilla
    ldHgGene hg17 encodeEgaspPartGenezilla lab/GeneZilla.*.gff
        # 656 gene predictions
    genePredCheck -db=hg17 encodeEgaspPartGenezilla

    # SAGA
    # Strip out trailing ## on lines where manual changes were made
    #   (see notes in .gtf file)
    sed -e 's/ ##.*//' lab/SAGA.*.gff | \
        ldHgGene hg17 encodeEgaspPartSaga stdin
        # 378 gene predictions
    genePredCheck -db=hg17 encodeEgaspPartSaga

    # Augustus
   ln -s lab/Augustus_EST-Protein.gtf_CHR_COORDS_hg17.gff augustus.est.gff
   ln -s lab/Augustus_abinitio.gtf_CHR_COORDS_hg17.gff augustus.abinitio.gff
   ln -s lab/Augustus_any.gtf_CHR_COORDS_hg17.gff augustus.any.gff
   ln -s lab/Augustus_dualgenome.gtf_CHR_COORDS_hg17.gff augustus.dual.gff

    foreach f (augustus.*.gff)
        set t = `echo $f | sed -e 's/augustus.\(.*\).gff/encodeEgaspPartAugustus\u\1/'`
        ldHgGene -genePredExt hg17 $t $f
        checkTableCoords hg17 $t
    end
        # augustus.abinitio.gff 418 gene predictions
        # augustus.any.gff      399 gene predictions
        # augustus.dual.gff     413 gene predictions
        # augustus.est.gff      381 gene predictions

    # Reload .est predictions (2006-01-09 kate)
    ldHgGene -genePredExt hg17 encodeEgaspPartAugustusEst augustus.est.gff
        # augustus.est.gff      381 gene predictions
    checkTableCoords hg17 encodeEgaspPartAugustusEst

    # AceSCAN
    # Split into two tracks -- conserved, and other, based on feature
    ldHgGene -predTab hg17 encodeEgaspPartAceCons aceCons.gp
        # 117 gene predictions
    ldHgGene -predTab hg17 encodeEgaspPartAceOther aceOther.gp
        # 727 gene predictions
    genePredCheck -db=hg17 encodeEgaspPartAceCons encodeEgaspPartAceOther

##########################################################################
# EGASP Full (2005-06-27 kate)
# Gene tracks submitted for the EGASP competition were hg17-based
#       by the Gencode group (Roderic Guigo, Julien Legarde, IMIM) 
    cd /cluster/data/encode
    cd EGASP/Full

    # Process "standard" gff files
    # NOTE: must dummy out scores -- float values
cat > doGene.hg17.csh << 'EOF'
ls *.gp | grep -v hg16 > gpList
foreach f (`cat gpList`)
    wc -l $f 
    set b = $f:r
    set t = encodeEgaspFull$b
    ldHgGene -predTab hg17 $t $f
    genePredCheck -db=hg17 $t
end
'EOF'
csh doGene.hg17.csh >&! doGene.hg17.log

    # process special files
    # These files are loaded with -genePredExt in addition to -predTab;
    # each <name>.gp is loaded into table encodeEgaspFull<name> and then
    # checked with genePredCheck.
    cd custom
cat > doGene.hg17.csh << 'EOF'
foreach f (Jigsaw.gp Ensembl.gp EnsemblPseudo.gp Exonhunter.gp GeneId.gp Sgp2.gp Twinscan.gp)
    set b = $f:r
    set t = encodeEgaspFull$b
    ldHgGene -genePredExt -predTab hg17 $t $b.gp
    genePredCheck -db=hg17 $t
end
'EOF'
# << for emacs
csh doGene.hg17.csh >&! doGene.hg17.log
    # NOTE: OK to have missing exonFrames
# Reading Ensembl.gp
# 735 gene predictions
# Reading EnsemblPseudo.gp
# 34 gene predictions
# Reading Exonhunter.gp
# 1435 gene predictions
# Reading GeneId.gp
# 476 gene predictions
# Reading Sgp2.gp
# 930 gene predictions
# Reading Twinscan.gp
# 954 gene predictions

    # process others
    set t = "encodeEgaspFullGenemark"
    ldHgGene -predTab hg17 $t Genemark.gp
        # 890 gene predictions
    genePredCheck -db=hg17 $t

    # create genepreds containing just exons flanking U12 introns
    set t = encodeEgaspFullGeneIdU12
    ldHgGene -predTab -genePredExt hg17 $t geneId.introns.gp
        # 24 gene predictions
    genePredCheck -db=hg17 $t
    set t = encodeEgaspFullSgp2U12
    ldHgGene -predTab -genePredExt hg17 $t sgp2.introns.gp
        # 20 gene predictions
    genePredCheck -db=hg17 $t


##########################################################################
# EGASP Update
# Submitted in hg17 coords

    # Jigsaw
    cd /cluster/data/encode
    cd EGASP/Jigsaw/2005-06-01
    ldHgGene -predTab -genePredExt hg17 encodeEgaspUpdJigsaw jigsaw.gp
        # 454 gene predictions
    genePredCheck -db=hg17 encodeEgaspUpdJigsaw

    # Augustus
    cd /cluster/data/encode
    cd EGASP/Augustus/2005-06-22
    foreach f (abinitio.gp any.gp dual.gp est.gp)
        genePredCheck $f
        set t = `echo $f | sed -e 's/\(.*\).gp/encodeEgaspUpdAugustus\u\1/'`
        ldHgGene -predTab -genePredExt hg17 $t $f
        checkTableCoords hg17 $t
    end
        # Reading abinitio.gp
        # 622 gene predictions
        # Reading any.gp
        # 571 gene predictions
        # Reading dual.gp
        # 617 gene predictions
        # Reading est.gp
        # 543 gene predictions

    # Exogean
    cd /cluster/data/encode
    cd EGASP/Exogean/2005-06-23
    ldHgGene -predTab hg17 encodeEgaspUpdExogean exogean.gp
        # 850 gene predictions
    genePredCheck -db=hg17 encodeEgaspUpdExogean

    # GeneIDU12 and SgpU12
    cd /cluster/data/encode
    cd EGASP/GeneIdU12/2005-06-10/
    # create GTF files from submitted GFF's
    awk -F\\t '/^chr/ {printf "%s\t%s\tCDS\t%s\t%s\t.\t%s\t%s\tgene_id \"%s\"; transcript_id \"%s\"; exon_type \"%s\";\n", $1, $2, $4, $5, $7, $8, $9, $9, $3}' < lab/UCSC-hg17-GeneID-U12-track.gff | grep -v intron > geneId.hg17.gtf
    ldHgGene -genePredExt hg17 encodeEgaspUpdGeneId geneId.hg17.gtf
        # 476 gene predictions
    genePredCheck -db=hg17 encodeEgaspUpdGeneId

    awk -F\\t '/^chr/ {printf "%s\t%s\tCDS\t%s\t%s\t.\t%s\t%s\tgene_id \"%s\"; transcript_id \"%s\"; exon_type \"%s\";\n", $1, $2, $4, $5, $7, $8, $9, $9, $3}' < lab/UCSC-hg17-SGP2-U12-track.gff | grep -v intron > sgp2.hg17.gtf
    ldHgGene -genePredExt hg17 encodeEgaspUpdSgp2 sgp2.hg17.gtf
        # 930 gene predictions
    genePredCheck -db=hg17 encodeEgaspUpdSgp2

    # create genepreds containing just exons flanking U12 introns
    # use U12 annotation as gene name, so it appears on details page
    grep U12 geneId.hg17.gtf | perl -wpe \
 's/(^.*gene_id) (\S+) (.*exon_type) (.*)(U12[^-]+)(.*)/$1 "$5"; $3 $4$5$6/' \
        > geneId.introns.hg17.gtf
    ldHgGene -genePredExt hg17 encodeEgaspUpdGeneIdU12 geneId.introns.hg17.gtf
        # 24 gene predictions

    grep U12 sgp2.hg17.gtf | perl -wpe \
 's/(^.*gene_id) (\S+) (.*exon_type) (.*)(U12[^-]+)(.*)/$1 "$5"; $3 $4$5$6/' \
        > sgp2.introns.hg17.gtf
    ldHgGene -genePredExt hg17 encodeEgaspUpdSgp2U12 sgp2.introns.hg17.gtf
        # 20 gene predictions


    # EGASP Yale Pseudogenes
    # Update submitted by Deyou Zheng 8/18/05
    cd /cluster/data/encode
    cd EGASP/yale/latest
    wc -l lab/*.submitted
        #  184 lab/YalePgene-NCBI35.gtf.submitted
        # NOTE: this is fewer than the previous submission -- I confirmed
        # with Deyou that this is correct.

    # munge to create CDS entries to display, and assign pseudogene
    # name as transcript_id, and pseudogene type as gene_id so
    # it displays on details page
    sed -e 's/pseudogene\t/CDS\t/' -e 's/pgene_type/gene_id/'  \
        -e 's/alt_name ENCODE_Yale/transcript_id /' \
                lab/YalePgene-NCBI35.gtf.submitted > yale.hg17.gtf
    ldHgGene -genePredExt hg17 encodeEgaspUpdYalePseudo yale.hg17.gtf
        # 184 gene predictions 
    genePredCheck -db=hg17 encodeEgaspUpdYalePseudo

    # Fgenesh++
    # Update submitted 9/30/05 by Victor Solovyev to Julien Legarde at
    # IMIM, to fix 4 regions (predictions originally on hg16, redone
    # for hg17)
    cd /cluster/data/encode/EGASP
    mkdir -p Fgenesh/2005-09-30/lab
    cd Fgenesh/2005-09-30/lab
    wget ftp://genome.imim.es/pub/projects/gencode/data/egasp05/submitted_predictions/EGASP_Update/FGenesh++_corrected_update.gtf_CHR_COORDS_hg17.gff
    wget ftp://genome.imim.es/pub/projects/gencode/data/egasp05/submitted_predictions/EGASP_partial/FGenesh++_corrected_partial.gtf_CHR_COORDS_hg17.gff
    cd ..
    cat *.gff | ldHgGene hg17 encodeEgaspUpdFgenesh stdin
    genePredCheck -db=hg17 encodeEgaspUpdFgenesh
        # 820 gene predictions
    

##########################################################################
# STANFORD PROMOTERS
    cd /cluster/data/encode/StanfordPromoters
    rm previous
    mv latest previous
    mkdir 2005-08-23
    ln -s 2005-08-23 latest
    mkdir latest/lab
    # copy updated files from Sara Hartman's email.
    # Both hg16 and hg17 versions were included:
    # hg16: StanfordPromoters_<cell>_08.23.txt
    # hg17: StanfordPromoters_hg17_<cell>_08.24.txt
    # Use Angie's processing from hg16, slightly modified
    cd latest
# doProm.csh: convert each hg17 Stanford Promoters spreadsheet in lab/ to
# encodeStanfordPromoters<cellType>.hg17.bed.  The cell type is taken from
# the file name.  For every file the header line is skipped, surrounding
# double quotes are stripped from column 8, and the 0/1 flag in column 4 is
# turned into a -/+ strand before the columns are reordered and piped
# through makeColoredBed.  The "Average" file takes its score from column 10
# and has fewer trailing columns; the per-cell-type files take the score
# from column 16 and drop rows containing "Bad Txfn".
# "tail -n +2" replaces the obsolete "tail +2" form.
cat > doProm.csh << 'EOF'
    foreach f (lab/StanfordPromoters_hg17*.txt)
      set cellType = `echo $f | perl -wpe 's^lab/StanfordPromoters_hg17_(.*)_.*^$1^'`
      echo $cellType
      if ($cellType == "Average") then
        tail -n +2 $f \
        | perl -wpe 'chomp; @w = split("\t"); $w[7] =~ s/^\"(.*)\"$/$1/; \
                     $w[3] =~ tr/01/-+/; \
                     $_ = join("\t", \
  $w[2], $w[4], $w[5], $w[0], $w[9], $w[3], $w[4], $w[5], 0, $w[1], $w[7], \
  $w[8]) . "\n";' \
        | makeColoredBed > encodeStanfordPromoters$cellType.hg17.bed
      else
        tail -n +2 $f \
        | grep -v "Bad Txfn" \
        | perl -wpe 'chomp; @w = split("\t"); $w[7] =~ s/^\"(.*)\"$/$1/; \
                     $w[3] =~ tr/01/-+/; \
                     $_ = join("\t", \
  $w[2], $w[4], $w[5], $w[0], $w[15], $w[3], $w[4], $w[5], 0, $w[1], $w[7], \
  $w[8], $w[9], $w[10], $w[11], $w[12], $w[13], $w[14]) . "\n";' \
        | makeColoredBed > encodeStanfordPromoters$cellType.hg17.bed
      endif
    end
'EOF'
    csh doProm.csh >&! doProm.log

cat > doLoad.csh << 'EOF'
    foreach f (encode*.bed)
      set track = $f:r:r
      if ($track == "encodeStanfordPromotersAverage") then
        hgLoadBed -tab -noBin -sqlTable=$HOME/kent/src/hg/lib/$track.sql \
          hg17 $track $f
      else
        sed -e "s/encodeStanfordPromoters/$track/" \
          $HOME/kent/src/hg/lib/encodeStanfordPromoters.sql > /tmp/esp.sql
        hgLoadBed -tab -noBin -sqlTable=/tmp/esp.sql hg17 $track $f
      endif
    end
'EOF'
    csh doLoad.csh >&! doLoad.log 

    # Put the negative control data spreadsheet out for download.
    ssh kkstore03
    cd /cluster/data/encode/StanfordPromoters/latest/lab
    nice gzip hg17_NegControlDataStanfordPromoters.txt
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/encode/datafiles
    mkdir -p stanfordPromoters
    cd stanfordPromoters
    cp -p \
        /cluster/data/encode/StanfordPromoters/latest/lab/hg17_NegControlDataStanfordPromoters.txt.gz \
                NegativeControlDataStanfordPromoters.txt.gz
    # Added a README.txt (edited from Angie's hg16 version)

##########################################################################
# UV Replication -- Segregation, Origins, and Origin Confidence tracks
#       New data for Oct. freeze (but submitted in hg16 coords)
#       All data are bed3
# Contact: Chris Taylor (cmt5n@cs.virginia.edu)
# 2007-04-14: Chris Taylor sent data for the ENm011 region for the 
# Replication track - this data was lost due to a problem with one of 
# Affy's mapping files (hartera). The update to add the missing data is only for the 
# Mid, Late and Pan-S subtracks of the UVa DNA Rep Seg track. The Early 
# subtrack does not have data in this region. (DONE, hartera, 2007-04-16)

# New Origins data, by new method (bubble trapping) submitted 2007-05-04
# by Chris Taylor.  2 datasets (GM and HeLa cells) on Affy ENCODE arrays.
# New subtracks added for Ori-Bubble (HeLa) and Ori-NS (HeLa and GM06990)
# (hartera, 2007-05-07)
    cd /cluster/data/encode/UVa
    mkdir -p 2005-08-30
    cd 2005-08-30
    mkdir lab

    # Segregation data - 4 subtracks (Early, Mid, Late, Pan-S)
    # 4 custom tracks in a single file -- use Hiram's script to split
    /cluster/data/encode/BU/orchid/2005-06-09/splitTracks.pl \
                lab/segchunks.hg16.qced.bed
    # creates t0, t1, t2, t3
    awk < lab/segchunks.hg16.qced.bed '/track/ {print $2}'
#name=early
#name=mid
#name=late
#name=pans
    grep -v "^track" t0 > encodeUvaDnaRepEarly.hg16.bed
    grep -v "^track" t1 > encodeUvaDnaRepMid.hg16.bed
    grep -v "^track" t2 > encodeUvaDnaRepLate.hg16.bed
    grep -v "^track" t3 > encodeUvaDnaRepPanS.hg16.bed
    rm t0 t1 t2 t3
    foreach f (encodeUvaDnaRep*.hg16.bed)
        set d = $f:r:r
        echo $d
        liftOver $f /cluster/data/encode/convertHg17/hg16ToHg17.chain \
                $d.hg17.bed $d.unmapped
        hgLoadBed -noBin -strict hg17 $d $d.hg17.bed
    end

    # Redo with hg17 resubmitted data
    cd /cluster/data/encode/UVa
    cd 2005-10-15
    /cluster/data/encode/bin/scripts/splitTracks.pl lab/segregation.hg17.bed
    grep -v "^track" t0 > encodeUvaDnaRepEarly.bed
    grep -v "^track" t1 > encodeUvaDnaRepMid.bed
    grep -v "^track" t2 > encodeUvaDnaRepLate.bed
    grep -v "^track" t3 > encodeUvaDnaRepPanS.bed
    rm t0 t1 t2 t3
    foreach f (encodeUvaDnaRep*.bed)
        set d = $f:r
        echo $d
        hgLoadBed -noBin -strict hg17 $d $d.bed
    end

    # Origin predictions -- fixed at 200bp
    set t = encodeUvaDnaRepOriginsPred
    ln -s lab/originspred.hg16.qced.bed $t.hg16.bed
    liftOver $t.hg16.bed \
        /cluster/data/encode/convertHg17/hg16ToHg17.chain \
                $t.hg17.bed $t.unmapped
    hgLoadBed -noBin -strict hg17 $t $t.hg17.bed
        # Loaded 289 elements of size 3

    # Origin confidence intervals -- varying length for averaged origins
    set t = encodeUvaDnaRepOriginsConf
    ln -s  lab/originsconf.hg16.qced.bed $t.hg16.bed
    liftOver $t.hg16.bed \
        /cluster/data/encode/convertHg17/hg16ToHg17.chain \
                $t.hg17.bed $t.unmapped
    hgLoadBed -noBin -strict hg17 $t $t.hg17.bed
        # Loaded 270 elements of size 3

    # Smoothed TR50 data
    #  500K 1bp float scores
    # wiggle with span=1
    set table = encodeUvaDnaRepTr50
    grep -v '^track' lab/smoothedtr50.hg17.wig | \
        wigEncode stdin $table.wig $table.wib
            #  upper limit 6.36, lower limit 2.05
    set dir = /gbdb/hg17/encode/UVa/2005-10-15
    mkdir -p $dir
    hgLoadWiggle -pathPrefix=$dir hg17 $table $table.wig
    ln -s `pwd`/$table.wib $dir

    # Update of tracks to add lost data for ENm011 region for the 
    # Replication track - UVa DNA Rep Seg (hartera). This extra data is
    # in hg17 coordinates.
    cd /cluster/data/encode/UVa/
    # -p: the dated parent directory does not exist yet
    mkdir -p 2007-04-14/lab
    cd 2007-04-14/lab
    # copy data updates here - sent by e-mail:
    # Early_ENm011.bed, Late_ENm011.bed, Mid_ENm011.bed, Pans_ENm011.bed

    # Early_ENm011.bed is empty because there were no Early intervals in 
    # ENm011 so no need to update the encodeUvaDnaRepEarly subtrack table.
    cd /cluster/data/encode/UVa/2007-04-14
    # strip custom-track header lines from the ENm011 additions
    grep -v "^track" ./lab/Mid_ENm011.bed > UvaDnaRepMid.bed
    grep -v "^track" ./lab/Late_ENm011.bed > UvaDnaRepLate.bed
    grep -v "^track" ./lab/Pans_ENm011.bed > UvaDnaRepPanS.bed

    # merge the ENm011 additions with the 2005-10-15 data for each subtrack
    foreach s (Mid Late PanS)
       echo $s
       cat /cluster/data/encode/UVa/2005-10-15/encodeUvaDnaRep${s}.bed \
           UvaDnaRep${s}.bed | sort -k1 > encodeUvaDnaRep${s}.bed
    end

    # Reload these tables
    # (only the merged encodeUvaDnaRep{Mid,Late,PanS}.bed files match here)
    foreach f (encodeUvaDnaRep*.bed)
        set d = $f:r
        echo $d
        hgLoadBed -noBin hg17 $d $d.bed
    end

# New Origins data, by new method (bubble trapping) submitted 2007-05-04
# by Chris Taylor.  2 datasets (GM and HeLa cells) on Affy ENCODE arrays.
# This is for the UVa DNA Rep Ori track (University of Virginia DNA
# Replication Origins track).
    # directory is "UVa" (capital V), as in the rest of this doc
    cd /cluster/data/encode/UVa
    mkdir -p 2007-05-04/lab
    cd 2007-05-04/lab
    cp /var/ftp/encode/* .
#-rw-r--r--  1 kate protein  5710 May  3 23:03 Ori-Bubble-HeLa.bed
#-rw-r--r--  1 kate protein  1880 May  3 23:01 Ori-Bubbledescription.txt
#-rw-r--r--  1 kate protein 18379 May  3 23:02 Ori-NS-GM.bed
#-rw-r--r--  1 kate protein 10524 May  3 23:02 Ori-NS-HeLa.bed
#-rw-r--r--  1 kate protein  2127 May  3 23:02 Ori-NSdescription.txt
    # Load data into database (hartera, 2007-05-07)
    # New methods used are Bubble and Nascent Strand (NS).
    # Ori-Bubble-HeLa.bed - Bubble method, HeLa cells
    # Ori-NS-HeLa.bed - Nascent strand method, HeLa cells
    # Ori-NS-GM.bed - Nascent strand method, GM06990 cells 
    cd /cluster/data/encode/UVa/2007-05-04

    # strip custom-track header lines; the output file name (minus .bed)
    # becomes the table name once prefixed with "encode" below
    grep -v "^track" ./lab/Ori-Bubble-HeLa.bed > UvaDnaRepOriginsBubbleHela.bed
    grep -v "^track" ./lab/Ori-NS-HeLa.bed > UvaDnaRepOriginsNSHela.bed
    grep -v "^track" ./lab/Ori-NS-GM.bed > UvaDnaRepOriginsNSGM.bed

    foreach f (UvaDnaRepOrigins*.bed)
        set d = $f:r
        echo $d
        hgLoadBed -noBin hg17 encode${d} $d.bed >> load.log
    end
    
    # add trackDb.encode.ra entries for new subtracks. Merge the new method
    # descriptions with the encodeUvaDnaRepOrigins.html description.

    # new data submitted 2007-05-11 to replace the original track (Heavy-light
    # DNA method) and new description including methods for this new data:
    # Ori-TR50.bed and Ori-description.html.
    mkdir -p /cluster/data/encode/UVa/2007-05-11/lab/

    cd /cluster/data/encode/UVa
    ln -s 2007-05-11 latest
    cd 2007-05-11
    # prepare and load the Ori-TR50 data
    grep -v "^track" ./lab/Ori-TR50.bed > UvaDnaRepOriginsTR50Hela.bed
    foreach f (UvaDnaRepOrigins*.bed)
        set d = $f:r
        echo $d
        hgLoadBed -noBin hg17 encode${d} $d.bed >> load.log
    end
    # add trackDb.encode.ra entry for the new subtrack. Use the new
    # description to replace the old one. 

    # There was an extra column in the Ori-TR50.bed file with a confidence 
    # metric that should be removed (Chris Taylor suggested this when asked 
    # about this column).
    # Remove the extra column and re-load table (2007-05-30, hartera)
    cd /cluster/data/encode/UVa/2007-05-11
    rm UvaDnaRepOriginsTR50Hela.bed
    grep -v "^track" ./lab/Ori-TR50.bed \
         | awk 'BEGIN {OFS="\t"} {print $1,$2,$3;}' \
         > UvaDnaRepOriginsTR50Hela.bed
    hgsql -e 'drop table encodeUvaDnaRepOriginsTR50Hela;' hg17
    foreach f (UvaDnaRepOrigins*.bed)
        set d = $f:r
        echo $d
        hgLoadBed -noBin hg17 encode${d} $d.bed >> load.log
    end
    
##########################################################################
# Indels from Jim Mullikin
# Heather, Sept. 2005

    ssh hgwdev
    cd /cluster/data/encode/NHGRI/mullikin/hg17
    # create the table from its schema, then build and load the bed file
    hgsql hg17 < encodeIndels.sql
    split4.pl < hg17.ENCODE.DIPtrack.Q23.bed4+ > split4.out
    # use a modified makeColoredBed
    ./makeColoredBed < split4.out > encodeIndels.bed
    # don't use -strict because we have lots of simple insertions (where chromStart = chromEnd)
    hgLoadBed hg17 encodeIndels -tab -sqlTable=encodeIndels.sql encodeIndels.bed
    # check reference length: list rows whose interval size differs from the
    # length of a multi-base reference string (run through hgsql rather than
    # as a pasted interactive "mysql>" prompt line)
    hgsql hg17 -e 'select chrom, chromStart, chromEnd, (chromEnd-chromStart) as size, traceName, reference, length(reference) as refsize from encodeIndels where (chromEnd-chromStart) != length(reference) and length(reference) > 1;'
    # Empty set (0.07 sec) -- i.e. no rows returned

##########################################################################
# Boston University ORChID track - (2005-09-18 kate)
#	data developer contact:  Jay Greenbaum jj@bu.edu
    ssh hgwdev
    cd /cluster/data/encode/BU
    mkdir -p orchid/2005-09-08/lab
    # plain cd: "-p" is a mkdir option (BSD csh rejects "cd -p dir"; tcsh
    # would merely print the directory stack)
    cd orchid/2005-09-08/lab
    wget --timestamping "http://dna.bu.edu/%7Ejj/cleavage_data_hg17/oh_cleavage_hg17.wig.gz"
    cd ..
    mkdir wib
    # NOTE: continue reluctantly with non-standard table name
    # as in hg16
    # encode the lab wiggle into a .wig table file plus a .wib binary file
    wigEncode lab/oh_cleavage_hg17.wig.gz \
        encodeBu_ORChID1.wig wib/encodeBu_ORChID1.wib
                # upper limit 1.58, lower limit -0.56
    # load the table, then expose the .wib under /gbdb via a symlink
    set dir = /gbdb/hg17/encode/Bu/2005-09-08
    mkdir -p $dir
    hgLoadWiggle -pathPrefix=$dir hg17 encodeBu_ORChID1 encodeBu_ORChID1.wig
    mkdir -p $dir/wib
    ln -s `pwd`/wib/encodeBu_ORChID1.wib $dir/wib

##########################################################################
# Genome Institute of Singapore -ChIP/PET of STAT1 TFBS (2005-09-29 kate)
# Submitted 9/19 by Atif Shahab
    cd /cluster/data/encode/GIS
    # NOTE(review): chip/ is created here but never entered; the dated
    # directory below is created directly under GIS -- confirm intent.
    mkdir chip
    mkdir -p 2005-09-19/lab
    ln -s 2005-09-19 latest
    cd latest
    # copy files from FTP dir to lab subdir
    # files: 2 bed files (stim and nonstim) and 1 doc file
    # use antiword to convert doc file to txt
    # short symlink names; the basename becomes the table-name suffix below
    ln -s lab/STAT1+stimulation.bed Gif.bed
    ln -s lab/STAT1+w:o_stimulation.bed NoGif.bed

    # Use cluster-count info, now embedded into the name, to make scored BED:
    # (Angie's methods from hg16)
    # For each line the perl filter requires the 4th field to look like
    # <digits>-<count>, sets the 5th field (score) to 1000 for count >= 4,
    # 800 for count >= 3, else 333, and rewrites the line tab-separated.
    # A line whose name does not match aborts with "parse".
    foreach f (Gif.bed NoGif.bed)
        set d = $f:r
        echo $d
        set table = encodeGisChipPetStat1$d
        perl -wpe 'chomp; @w = split; \
                 if ($w[3] =~ /^\d+-(\d+)$/) { \
                   $w[4] = ($1 >= 4 ? 1000 : ($1 >= 3 ? 800 : 333)); \
                 } else { die "parse"; } \
                 $_ = join("\t", @w) . "\n";' \
               $f > ${table}.tab
       hgLoadBed hg17 $table ${table}.tab
       checkTableCoords hg17 $table
    end
    # Reading encodeGisChipPetStat1Gif.tab
    # Loaded 4007 elements of size 12
    # Reading encodeGisChipPetStat1NoGif.tab
    # Loaded 3180 elements of size 12
    # NOTE: These counts correspond with the doc file they provided
    # Unlike the previous GIS Chip/chip dataset, these are only
    # in the ENCODE regions.  I requested the genome-wide
    # data -- they will provide  this later.


##########################################################################
# Genome Institute of Singapore - PET RNA (2005-10-19 kate)
# Submitted 10/11 by Atif Shahab
#       3 datasets - 5FU treated HCT116 cells, 
#                    MCF7 untreated 
#                    Estrogen-treated MCF7 (new)
# Replace data in existing subtracks, and add new one
    cd /cluster/data/encode/GIS
    mkdir -p rna/2005-10-11/lab
    # Work from the submission directory, not lab/: the symlink and load.csh
    # below both use lab/-relative paths.
    cd rna/2005-10-11
    # copy files from FTP dir into lab/
    ln -s MCF7_estrogen_treated.bed lab/MCF7Estr-hg17.bed

    # use Angie's loading process from hg16
    # load.csh: for each cell-type file, keep the chr* lines, pull two
    # numbers out of the name field (4th column, <n>-<mc>-<ac>) and set the
    # 9th column to a colour chosen from them (presumably itemRgb -- confirm),
    # then load the result as table encodeGisRnaPet<cellType>.
cat > load.csh << 'EOF'
    foreach f (lab/HCT116-hg17.bed lab/MCF7-hg17.bed lab/MCF7Estr-hg17.bed)
      set cellType = `echo $f:t:r | sed -e 's/-hg17//'`
      echo $cellType
      set table = encodeGisRnaPet$cellType
      grep '^chr' $f | \
      perl -wpe \
     'chomp; @w = split; \
      if ($w[3] =~ /\d+-(\d+)-(\d+)/) { \
        ($mc, $ac) = ($1, $2); \
        if ($mc == 1)   { $w[8] = ($ac > 1) ? "35,35,175" : "160,160,188"; } \
        elsif ($mc > 1) { $w[8] = ($ac > 1) ? "180,120,0" : "225,150,0"; } \
        else { die "mc $mc" } \
      } else { die "parse"; } \
      $_ = join(" ", @w) . "\n";'  > $table.bed
      hgLoadBed hg17 $table $table.bed
    end
'EOF'
csh load.csh >&! load.log
    # remove the intermediate per-table .bed files written by load.csh
    rm *.bed

##########################################################################
# UCSD/LI Nimblegen Hela
# Data submitted on hg17 for June freeze

    cd /cluster/data/encode/UCSD/nimblegen/2005-06-01
    # Derive each table name from the wiggle file's basename by expanding the
    # lab's short factor names, then load the chr* lines as a 4-column
    # bedGraph and verify the coordinates.
    foreach wigFile (lab/Nim*/*.wig)
        set table = `echo $wigFile:t:r | sed -e 's/rnap/encodeUcsdNgHeLaRnap/' -e 's/tmh3k4/encodeUcsdNgHeLaH3K4me3/'`
        echo $table
        grep "^chr" $wigFile | hgLoadBed -onServer -bedGraph=4 hg17 $table stdin
        checkTableCoords hg17 $table
    end
        # Produces 4 tables, encodeUcsdNgHeLa{Rnap,H3K4me3}_p{0,30}
        # Loaded 385149 elements of size 4 

# UCSD/Ludwig Institute Nimblegen chip/chip (2005-10-07 KATE)
#   New data submission
# New data 2006-12-04 by Keith Ching <keching@ucsd.edu>

    ssh hgwdev
    cd /cluster/data/encode/UCSD/nimblegen
    mkdir 2005-09-29
    cd 2005-09-29
    mkdir lab
    # copy file from FTP dir, unzip and untar, into lab dir
    # 12 data files and README

    # load.csh: table name is encodeUcsdNgHeLa + the file basename with its
    # first letter upper-cased (\u is a GNU sed extension); the chr* lines are
    # loaded as a 4-column bedGraph.  The loop globs lab/*.wig directly rather
    # than parsing ls output.
cat > load.csh << 'EOF'
    foreach f (lab/*.wig)
        set table = `echo $f:t:r | sed -e 's/\(.*\)/encodeUcsdNgHeLa\u\1/'`
        echo $table
        grep '^chr' $f | hgLoadBed -onServer -bedGraph=4 hg17 $table stdin
        checkTableCoords hg17 $table
    end
'EOF'
    csh load.csh >&! load.log
    # Created hg17 composite track with all 16 datasets
    # The hg16 composite only has the first 4 submitted

##########################################################################
# UCSD/LI Chip/Chip on Nimblegen and PCR platforms (2006-12-04)
# from Keith Ching <keching@ucsd.edu>
    # New data submitted 2007-03-01 
    # New data submitted 2007-05-23
    # New data submitted 2007-05-29
    # Mar07 data resubmitted 2007-08-28
    # Aug07 data submitted 2007-08-28
    # NOTE: some of this data is resubmissions of existing tracks
    # in slightly different formats.  For hg17, I will load two new
    # tracks -- one for Nimblegen platform, the other for PCR platform.
    # For hg18, the older tracks will be merged in, with duplicate
    # datasets removed.
    # Working... 2007-08-10 (kate)

    ssh kkstore03
    cd /cluster/data/encode/UCSD
    # -p: the dated parent directory does not exist yet
    mkdir -p 2006-12-04/lab
    cd 2006-12-04/lab
# files: encode.{nimblegen,pcr}.wig.gz, encode_{nimblegen,pcr}_desc.html
# Nimblegen data (9 tracks): 6 histones, Pol2, TAF1, p300 in HeLa
# PCR data (8 tracks): CTCF, 2 histones, TAF1 in HeLa, U937, GM, and IMR90
# All data submitted on hg17, although the PCR array is hg16.
# The Ng data was overlap-clipped by the submitter to 38bp regions,
# starting at the beginning of the first 50mer (later submissions were
# clipped starting at the 6th base of the first 50mer).

# candidates for merging into existing tracks: LI Ng gIF, LI Chip Various
# (asking submitters 2006-12-20 kate)
    # back up to the submission directory: everything below uses
    # lab/-relative paths
    cd ..
/cluster/data/encode/bin/scripts/splitTracks.pl lab/encode.nimblegen.wig
# total lines read: 3466350, track declarations: 9, data lines: 3466341
# load.ng.csh: for each split file t0..t8, take the antibody from the track
# line (text between "name=" and "_0 description"), rename the file to
# HeLa<antibody>.wig and load its chr* lines as a 4-column bedGraph.
# The closing 'EOF' must start in column 0 for csh to end the here document.
cat > load.ng.csh << 'EOF'
foreach f (t0 t1 t2 t3 t4 t5 t6 t7 t8)
    set ab = `sed -n '/track/s/.*name=\(.*\)_0 description.*/\1/p' $f`
    set d = HeLa$ab
    mv $f $d.wig
    set table = encodeUcsdLiNg$d
    echo $table
    grep '^chr' $d.wig | hgLoadBed -onServer -bedGraph=4 hg17 $table stdin
    checkTableCoords hg17 $table
end
'EOF'
    csh load.ng.csh >&! load.ng.log &
#Loaded 385149 elements of size 4
#encodeUcsdLiNgHeLaH3
#encodeUcsdLiNgHeLaH3ac
#encodeUcsdLiNgHeLaH4ac
#encodeUcsdLiNgHeLaH3K4me1
#encodeUcsdLiNgHeLaH3K4me2
#encodeUcsdLiNgHeLaH3K4me3
#encodeUcsdLiNgHeLaTAF1
#encodeUcsdLiNgHeLaRNAPII
#encodeUcsdLiNgHeLap300

    # split the multi-track PCR wiggle into one file per track (t0..t7)
    /cluster/data/encode/bin/scripts/splitTracks.pl lab/encode.pcr.wig
# total lines read: 196240, track declarations: 8, data lines: 196232
    # load.pcr.csh: for each split file, read two pieces from its track line:
    #   ab   = the run of non-underscore characters right after "name="
    #   cell = the text between an underscore and " description"
    # then rename the file to <cell><ab>.wig and load its chr* lines as a
    # 4-column bedGraph table encodeUcsdLiPcr<cell><ab>.
    cat > load.pcr.csh << 'EOF'
foreach f (t0 t1 t2 t3 t4 t5 t6 t7)
    set ab = `sed -n '/track/s/.*name=\([^_][^_]*\).*/\1/p' $f`
    set cell = `sed -n "/track/s/.*name=.*_\(.*\) description.*/\1/p" $f`
    set d = ${cell}${ab}
    mv $f $d.wig
    set table = encodeUcsdLiPcr$d
    echo $table
    grep '^chr' $d.wig | hgLoadBed -onServer -bedGraph=4 hg17 $table stdin
    checkTableCoords hg17 $table
    end
'EOF'
    # run in the background, capturing stdout and stderr in the log
    csh load.pcr.csh >&! load.pcr.log &
    # Loaded 24529 elements of size 4
    # encodeUcsdLiPcrGM06990CTCF
    # encodeUcsdLiPcrHeLaCTCF
    # encodeUcsdLiPcrU937CTCF
    # encodeUcsdLiPcrGM06990H3K4me1
    # encodeUcsdLiPcrGM06990H3K4me3
    # encodeUcsdLiPcrHeLaH3K4me3
    # encodeUcsdLiPcrIMR90H3K4me3
    # encodeUcsdLiPcrGM06990TAF1

    # NOTE: GM/CTCF data was also submitted in May.
    # Keith Ching advises dropping this version of the data and
    # keeping May.

    # remove the December GM06990/CTCF table created by load.pcr.csh above
    hgsql hg17 -e "drop table encodeUcsdLiPcrGM06990CTCF"

    # New data submitted 2007-03-01 
    # Data includes various cell lines and antibodies using both PCR and 
    # Nimblegen and there are no gamma interferon treatments. 
    # Nimblegen data (11 tracks): 3 histones, Pol2, TAF2, CTCF in GM06990
    #                             4 histones, CTCF in HeLa
    # PCR data (20 tracks): histones, CTCF, TAFII, p300 in GM, K562, HeLa, IMR90, Tonsil
    # NOTE: PCR data submitted in hg16 coords
    # NOTE: For PCR data there are 4 tracks with the same label (TAF2/p250)
    # and 3 tracks with another (p300_C).  The name generator was confused by underscore
    # in antibody name. 
    # This data was resubmitted on 8/28 (see below)
    cd /cluster/data/encode/UCSD
    mkdir -p 2007-03-01/lab
    cd 2007-03-01/lab
    # load data from FTP site
    cd ..
    # split the multi-track wiggle into one file per track (t0..t10)
    /cluster/data/encode/bin/scripts/splitTracks.pl lab/nimblegen.wig
    # total lines read: 4236650, track declarations: 11, data lines: 4236639
    # load.ng.csh: for each split file, read two pieces from its track line:
    #   ab   = the leading run of characters after "name=" that are not
    #          underscore or parenthesis
    #   cell = the text between an underscore and "_Chromatin description"
    # then rename the file to <cell><ab>.wig and load its chr* lines as a
    # 4-column bedGraph table encodeUcsdLiNg<cell><ab>.
cat > load.ng.csh << 'EOF'
    foreach f (t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10)
        set ab = `sed -n '/track/s/.*name=\([^_()][^_()]*\).*/\1/p' $f`
        set cell = `sed -n "/track/s/.*name=.*_\(.*\)_Chromatin description.*/\1/p" $f`
        set d = ${cell}${ab}
        mv $f $d.wig
        set table = encodeUcsdLiNg$d
        echo $table
        grep '^chr' $d.wig | hgLoadBed -onServer -bedGraph=4 hg17 $table stdin
        checkTableCoords hg17 $table
    end
'EOF'
    # run in the background, capturing stdout and stderr in the log
    csh load.ng.csh >&! load.ng.log &
    # Loaded 385149 elements of size 4
    # encodeUcsdLiNgGM06990TAFII
    # encodeUcsdLiNgGM06990H3K4me3
    # encodeUcsdLiNgGM06990H3K27Ac
    # encodeUcsdLiNgGM06990RNAPII
    # encodeUcsdLiNgGM06990CTCF
    # encodeUcsdLiNgGM06990H3K18Ac
    # encodeUcsdLiNgHeLaCTCF
    # encodeUcsdLiNgHeLaH3K18Ac
    # encodeUcsdLiNgHeLaH3K27Ac
    # encodeUcsdLiNgHeLaH3K9Ac
    # encodeUcsdLiNgHeLaH3K27me3

    # New data submitted 2007-05-23
    #  PCR data (7 tracks):  Histones, TAF2, Pol2, CTCF in GM and K562
    #  NOTE: submitted in hg16 coords
    # Lifted and reloaded 2007-09-05 (kate)
    # Start from the UCSD top level: the previous step left the shell inside
    # 2007-03-01, and the distribution checks further down glob the
    # submission directories as siblings.
    cd /cluster/data/encode/UCSD
    mkdir -p 2007-05-23/lab
    cd 2007-05-23/lab
    # fetch and unpack the submission, then count lines per wiggle
    cp /var/ftp/encode/ucsc.zip .
    unzip ucsc.zip
    wc -l *.wig
  # 24538 ave_H3K18Ac_GM.rst.wig
  # 24538 ave_H3K27Ac_GM.rst.wig
  # 24538 ave_H3K9Ac_GM.rst.wig
  # 24538 ave_H3K9Ac_K562.rst.wig
  # 24538 ave_RNAPII_GM.rst.wig
  # 24538 ave_TAF250_GM.rst.wig
  # 24538 ave_ctcf_gm.rst.wig

    cd ..
    # load.pcr.csh: lift each lab wiggle's chr* lines from hg16 to hg17 and
    # load the result as bedGraph table encodeUcsdLiPcr<cell><antibody>,
    # with both name parts taken from the file's track line.
cat > load.pcr.csh << 'EOF'
    set chain = /cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain.gz
    foreach wigFile (lab/*.wig)
        set antibody = `sed -n '/track/s/.*name=\([^_][^_]*\).*/\1/p' $wigFile`
        set cellLine = `sed -n "/track/s/.*name=.*_\(.*\) description.*/\1/p" $wigFile`
        set dataset = ${cellLine}${antibody}
        set table = encodeUcsdLiPcr${dataset}
        echo $table
        grep '^chr' $wigFile \
            | liftOver stdin $chain ${dataset}.hg17.bedGraph ${dataset}.unmapped
        hgLoadBed -onServer -bedGraph=4 hg17 $table ${dataset}.hg17.bedGraph
        checkTableCoords hg17 $table
    end
'EOF'
    csh load.pcr.csh >&! load.pcr.log &
    # Loaded 24528 elements of size 4
    # NOTE: dropped 9 elements on chrX
    # encodeUcsdLiPcrGMH3K18Ac
    # encodeUcsdLiPcrGMH3K27Ac
    # encodeUcsdLiPcrGMH3K9Ac
    # encodeUcsdLiPcrK562H3K9Ac
    # encodeUcsdLiPcrGMRNAPII
    # encodeUcsdLiPcrGMTAF250
    # encodeUcsdLiPcrgmctcf

    # rename for consistency
    # the lab's track line used lower case ("gmctcf"); give the table the
    # same capitalisation as its siblings
    hgsql hg17 -e "alter table encodeUcsdLiPcrgmctcf rename to encodeUcsdLiPcrGMCTCF"

    # New data submitted 2007-05-29
    # Nimblegen data (10 tracks): TAF, histones in GM and K562
    # Start from the UCSD top level: the previous step left the shell inside
    # 2007-05-23, and submission directories are siblings under UCSD.
    cd /cluster/data/encode/UCSD
    mkdir -p 2007-05-29/lab
    cd 2007-05-29/lab
    # load data from FTP site
    cd ..
    # load.ng.csh: for each lab wiggle, read two pieces from its track line:
    #   ab   = the leading run of characters after "name=" that are not
    #          underscore or parenthesis
    #   cell = the text between an underscore and "_Chromatin description"
    # then move the file to <cell><ab>.wig in this directory and load its
    # chr* lines as 4-column bedGraph table encodeUcsdLiNg<cell><ab>.
cat > load.ng.csh << 'EOF'
    foreach f (lab/*.wig)
        set ab = `sed -n '/track/s/.*name=\([^_()][^_()]*\).*/\1/p' $f`
        set cell = `sed -n "/track/s/.*name=.*_\(.*\)_Chromatin description.*/\1/p" $f`
        set d = ${cell}${ab}
        mv $f $d.wig
        set table = encodeUcsdLiNg$d
        echo $table
        grep '^chr' $d.wig | hgLoadBed -onServer -bedGraph=4 hg17 $table stdin
        checkTableCoords hg17 $table
    end
'EOF'
    csh load.ng.csh >&! load.ng.log &
    # Loaded 385149 elements of size 4
    # encodeUcsdLiNgK562TAFII
    # encodeUcsdLiNgK562H3K4me1
    # encodeUcsdLiNgK562H3K27Ac
    # encodeUcsdLiNgK562H3K18Ac
    # encodeUcsdLiNgK562H3K4me3
    # encodeUcsdLiNgK562H3K4me2
    # encodeUcsdLiNgK562H3K9Ac
    # encodeUcsdLiNgGM06990H3K4me1
    # encodeUcsdLiNgGM06990H3K9Ac
    # encodeUcsdLiNgGM06990H3K4me2

    # Mar07 data resubmitted 2007-08-28
    # 20 sets of PCR data, submitted in hg16 coords
    # Loaded 2007-09-04 (kate)
    # Start from the UCSD top level: the previous step left the shell inside
    # 2007-05-29, and submission directories are siblings under UCSD.
    cd /cluster/data/encode/UCSD
    mkdir -p 2007-08-28-Mar/lab
    cd 2007-08-28-Mar/lab
    # take the resubmitted archive out of the FTP area and unpack it
    mv /var/ftp/encode/UCSC200702.zip .
    unzip UCSC200702.zip
    wc -l *.wig
   #24538 ave_CTCF_K562.rst.wig
   #24538 ave_H2AZ_HeLa.rst.wig
   #24538 ave_H3K18Ac_K562.rst.wig
   #24538 ave_H3K27Ac_HeLa.rst.wig
   #24538 ave_H3K27Ac_K562.rst.wig
   #24538 ave_H3K4me1_IMR90.rst.wig
   #24538 ave_H3K4me1_K562.rst.wig
   #24538 ave_H3K4me2_GM06990.rst.wig
   #24538 ave_H3K4me2_K562.rst.wig
   #24538 ave_H3K4me3_K562.rst.wig
   #24538 ave_H4K20me1_HeLa.rst.wig
   #24538 ave_H4K20me2_HeLa.rst.wig
   #24538 ave_H4K20me3_HeLa.rst.wig
   #24538 ave_TAF250_GM06990.rst.wig
   #24538 ave_TAF250_HeLa.rst.wig
   #24538 ave_TAF250_K562.rst.wig
   #24538 ave_TAF250_Tonsil.rst.wig
   #24538 ave_p300_GM06990.rst.wig
   #24538 ave_p300_IMR90.rst.wig
   #24538 ave_p300_K562.rst.wig
    cd ..
    # confirmed that no datasets below will overwrite previously
    # loaded tables (no resubmissions).  See oldtables.txt and tables.txt
    # load.pcr.csh: lift each lab wiggle's chr* lines from hg16 to hg17 and
    # load the result as bedGraph table encodeUcsdLiPcr<cell><antibody>,
    # with both name parts taken from the file's track line.
cat > load.pcr.csh << 'EOF'
    set chain = /cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain.gz
    foreach wigFile (lab/*.wig)
        set antibody = `sed -n '/track/s/.*name=\([^_][^_]*\).*/\1/p' $wigFile`
        set cellLine = `sed -n "/track/s/.*name=.*_\(.*\) description.*/\1/p" $wigFile`
        set dataset = ${cellLine}${antibody}
        set table = encodeUcsdLiPcr${dataset}
        echo $table
        grep '^chr' $wigFile \
            | liftOver stdin $chain ${dataset}.hg17.bedGraph ${dataset}.unmapped
        hgLoadBed -onServer -bedGraph=4 hg17 $table ${dataset}.hg17.bedGraph
        checkTableCoords hg17 $table
    end
'EOF'
    csh load.pcr.csh >&! load.pcr.log &
    # Loaded 24528 elements of size 4
    # NOTE: 9 elements on chrX dropped by lift
        # encodeUcsdLiPcrK562CTCF
        # encodeUcsdLiPcrHeLaH2AZ
        # encodeUcsdLiPcrK562H3K18Ac
        # encodeUcsdLiPcrHeLaH3K27Ac
        # encodeUcsdLiPcrK562H3K27Ac
        # encodeUcsdLiPcrIMR90H3K4me1
        # encodeUcsdLiPcrK562H3K4me1
        # encodeUcsdLiPcrGM06990H3K4me2
        # encodeUcsdLiPcrK562H3K4me2
        # encodeUcsdLiPcrK562H3K4me3
        # encodeUcsdLiPcrHeLaH4K20me1
        # encodeUcsdLiPcrHeLaH4K20me2
        # encodeUcsdLiPcrHeLaH4K20me3
        # encodeUcsdLiPcrGM06990TAF250
        # encodeUcsdLiPcrHeLaTAF250
        # encodeUcsdLiPcrK562TAF250
        # encodeUcsdLiPcrTonsilTAF250
        # encodeUcsdLiPcrGM06990p300
        # encodeUcsdLiPcrIMR90p300
        # encodeUcsdLiPcrK562p300

    # NOTE: GM/TAF data was also in the May submission.
    # Keith Ching advises dropping the March submission and
    # keeping the May

    # remove the March GM06990/TAF250 table just created by load.pcr.csh
    hgsql hg17 -e "drop table encodeUcsdLiPcrGM06990TAF250"

    # Aug07 data submitted 2007-08-28
    # Loaded 2007-09-05 (kate)
    # PCR data: 4 datasets (YY1 and p300 in HeLa, H4Ac in K562, CTCF in IMR90)
    # Submitted in hg16 coordinates
    # Start from the UCSD top level: the previous step left the shell inside
    # 2007-08-28-Mar, and submission directories are siblings under UCSD.
    cd /cluster/data/encode/UCSD
    mkdir -p 2007-08-28/lab
    cd 2007-08-28/lab
    # move the archive into lab/ (mv needs the "." destination) and unpack
    mv /var/ftp/encode/encode200708.zip .
    unzip encode200708.zip
  # 24538 ave_CTCF_IMR90.rst.wig
  # 24538 ave_H4Ac_K562.rst.wig
  # 24538 ave_YY1_HeLa.rst.wig
  # 24538 ave_p300_HeLa.rst.wig
  # 61 encode_pcr_desc.html

    cd ..
    # load.pcr.csh: lift each lab wiggle's chr* lines from hg16 to hg17 and
    # load the result as bedGraph table encodeUcsdLiPcr<cell><antibody>,
    # with both name parts taken from the file's track line.
cat > load.pcr.csh << 'EOF'
    set chain = /cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain.gz
    foreach wigFile (lab/*.wig)
        set antibody = `sed -n '/track/s/.*name=\([^_][^_]*\).*/\1/p' $wigFile`
        set cellLine = `sed -n "/track/s/.*name=.*_\(.*\) description.*/\1/p" $wigFile`
        set dataset = ${cellLine}${antibody}
        set table = encodeUcsdLiPcr${dataset}
        echo $table
        grep '^chr' $wigFile \
            | liftOver stdin $chain ${dataset}.hg17.bedGraph ${dataset}.unmapped
        hgLoadBed -onServer -bedGraph=4 hg17 $table ${dataset}.hg17.bedGraph
        checkTableCoords hg17 $table
    end
'EOF'
    csh load.pcr.csh >&! load.pcr.log &
    # Loaded 24528 elements of size 4
    # encodeUcsdLiPcrIMR90CTCF
    # encodeUcsdLiPcrK562H4Ac
    # encodeUcsdLiPcrHeLaYY1
    # encodeUcsdLiPcrHeLap300

    # data distribution for 2007 PCR data
     # show the largest values in column 4 across the PCR data files
     # (globs are relative; presumably run from /cluster/data/encode/UCSD)
     grep ^chr *12-04/lab/*pcr*wig *Mar/*.hg17.bedGraph *5-23/*.hg17.bedGraph *-08-28/*.hg17.bedGraph | awk '{print $4}' | sort -nr | head
    # 102.5771
    # 38.3034
    # 27.3306
    # 26.7377
    # 24.0476

     # histogram of the same column-4 values in bins of 0.2, leaving out the
     # values that start with 102 (the single outlier listed above)
     grep ^chr *12-04/lab/*pcr*wig *Mar/*.hg17.bedGraph *5-23/*.hg17.bedGraph *-08-28/*.hg17.bedGraph | awk '{print $4}' | sort -nr | grep -v ^102 | textHistogram stdin -real -binSize=.2 -maxBinCount=50
     # large values truncated: need 191 bins or larger binSize than 0.2

    # 0.000000 ***************************** 153175
    # 0.200000 ***** 27298
    # 0.400000 **** 18398
    # 0.600000 ************ 64313
    # 0.800000 ************************************************************ 313021
    # 1.000000 ************************************************* 257011
    # 1.200000 ************ 63866
    # 1.400000 **** 22781
    # 1.600000 ** 9400
    # 1.800000 * 6465
    # 2.000000 * 3904
    # 2.200000 * 4096
    # 2.400000  2151
    # 2.600000  1658
    # 2.800000  1313

     # show the largest values in column 4 across the Nimblegen data files.
     # The 5-29 submission is globbed as *5-29/*.wig (its loader moved the
     # wiggles into the submission directory); a bare directory name is not
     # something grep can read.
     grep ^chr *12-04/lab/*nimb*wig *-03-01/*.wig *5-29/*.wig | awk '{print $4}' | sort -nr | head
     #4.48
     #4.440
     #4.40
     #4.319

     # NOTE: multiple submissions for the same experiment were provided:
     # CTCF in GM: 12/06, 5/07
     # TAF1 in GM: 12/06, 3/07, 5/07
     # Asked Keith Ching  -- he says keep only the 5/7 versions

     # Drop the superseded GM06990 tables.  "if exists" because the CTCF and
     # TAF250 tables were already dropped earlier in this doc, and a plain
     # DROP TABLE on a missing table is an error.
     hgsql hg17 -e "drop table if exists encodeUcsdLiPcrGM06990CTCF"
     hgsql hg17 -e "drop table if exists encodeUcsdLiPcrGM06990TAF1"
     hgsql hg17 -e "drop table if exists encodeUcsdLiPcrGM06990TAF250"

##########################################################################
# UT-Austin (Vishy Iyer lab) Chip/chip  (2005-10-10 kate)
    cd /cluster/data/encode
    # -p: the intermediate directories may not exist yet
    mkdir -p UTexas/2005-10-01/lab
    # stay in the submission directory: load.csh below reads lab/*.wig
    cd UTexas/2005-10-01
    # copy file from FTP dir into lab/
    # 8 .wig data files (4 experiments, with raw data, and "peaks"), plus description file

    # load.csh: rewrite each file basename into a table name with a chain of
    # sed substitutions (the last one reorders the four underscore-separated
    # parts behind the encodeUtexChip prefix), then load the chr* lines as a
    # 4-column bedGraph.  The loop globs lab/*.wig directly rather than
    # parsing ls output.
cat > load.csh << 'EOF'
    foreach f (lab/*.wig)
        set table = `echo $f:t:r | sed -e 's/HeLa/HeLa_NoSerum/;s/NoSerum//;s/Serum4hr/Stim/;s/2091/2091fib/;s/\(.*\)_\(.*\)_\(.*\)_\(.*\)/encodeUtexChip\1\3\2\u\4/'`
        echo $table
        grep '^chr' $f | hgLoadBed -onServer -bedGraph=4 hg17 $table stdin
        checkTableCoords hg17 $table
    end
'EOF'
    csh load.csh >&! load.log
    # Created composite track with 8 subtracks

    
##########################################################################
# Affy Chip/chip and RNA (kate)
# submitted by Hari_Tammana@affymetrix.com (Oct. 3)
#  with clarifications as to display from Phil Kapranov at Affy
# HeLa data update submitted 12/15 by Hari Tamani

    cd /cluster/data/encode/Affy
    # -p: creates the dated parent directory as well as lab/
    mkdir -p 2005-10-03/lab
    cd 2005-10-03/lab
    # copy file from FTP dir (500M) affy_oct1.tar.gz
    # two data dirs: CHIP, RNA
    # 10 descriptions for CHIP dir, 3 for RNA dir
    # RNA has 2 dirs (bed, wig) with each
    #   having 3 cell lines (GM06990, HeLa, HL60; the HL60
    #   data has 4 timepoints (0, 2, 8, 32)
    #  README's (and discussions with Phil) indicate the 
    #     wig's are replacements for previous RNA Signal
    #     data, and bed's are replacement Transfrags
    # The CHIP .wig files are similar to the previous
    #   Affy Pval data, but analyzed with stricter analysis
    #   criteria.  The .bed files are comparable to the Sites
    #  track.  2 factors are repeats from previous track
    #   (HisH4 TetraAc, Pol2), and 3 are new (H3K9K14DiAc, 
    # p63_ActD (with Actinomycin D treatment), 
    # p63_mActD (without Actinomycin D treatment)
    #  The Pol2, HisH4, and H3* data are at 4 timepoints.
    # These should be loaded in addition to previous tracks
    #  (not replacements).  Later the earlier ("lenient")
    #  analysis will be submitted for the 3 new factors,
    #  and these will be added to the previous Affy Chip/chip tracks
    #  on hg17.

    # Transfrags (6 subtracks)
    cd /cluster/data/encode/Affy/2005-10-03
    # Each BED file is loaded without its first line.  "tail -n +2" is the
    # current spelling; the old "tail +2" form is rejected by newer tail
    # implementations.
    tail -n +2 lab/RNA/bed/GM06990/EC_AS_GM06990_RCyP+_C01vsNULL.sig.gr.bed \
    | hgLoadBed -noBin hg17 encodeAffyRnaGm06990Sites stdin
        # 4377 elements
    tail -n +2 lab/RNA/bed/HeLa/EC_AS_HeLaS3_RCyP+_C01vsNULL.sig.gr.bed \
    | hgLoadBed -noBin hg17 encodeAffyRnaHeLaSites stdin
        # 2037 elements
    # loadSites.csh: one table per HL60 timepoint; the table name is built
    # from the hour count embedded in the file name.
cat > loadSites.csh << 'EOF'
    foreach f (lab/RNA/bed/HL60/??/*HL60*.bed)
      set track = `echo $f:t:r:r:r | perl -wpe \
        's/EC_AS_HL60_RWP\+_RA_(\d+)hr_C01vsNULL/encodeAffyRnaHl60SitesHr$1/;'`
      echo $track
      tail -n +2 $f \
        | hgLoadBed -noBin hg17 $track stdin
    end    
'EOF'
    csh loadSites.csh >&! loadSites.log

    # Update HeLa sites (12/15)
    cd /cluster/data/encode/Affy/2005-11-22
    # reload the HeLa sites table from the updated file, skipping its first
    # line ("tail -n +2" replaces the obsolete "tail +2" spelling; the
    # doubled slash in the path is also removed)
    tail -n +2 lab/Affy_HeLa/bed/EC_AS_HeLa_RCyP+_C01vsNULL.sig.gr.bed | \
        hgLoadBed -strict -noBin hg17 encodeAffyRnaHeLaSites stdin
            # 7254 elements

    # RNA Signal (6 subtracks)
    # The HeLa sites update just above leaves the shell in 2005-11-22, but
    # the lab/RNA/wig inputs and the wib/ and wig/ output directories for
    # this step belong to the October submission.
    cd /cluster/data/encode/Affy/2005-10-03
    set gbdbDir = /gbdb/hg17/encode/Affy/2005-10-03
    mkdir -p $gbdbDir/wib
    mkdir wib wig

    # For each cell line: encode the wiggle into wig/ and wib/ files,
    # symlink the .wib into /gbdb, and load the table.
    set track = encodeAffyRnaGm06990Signal
    cat lab/RNA/wig/GM06990/EC_AS_GM06990_RCyP+_C01vsNULL.sig.wig \
    | wigEncode stdin wig/$track.wig wib/$track.wib
    ln -s `pwd`/wib/$track.wib $gbdbDir/wib/
    nice hgLoadWiggle hg17 $track wig/$track.wig -pathPrefix=$gbdbDir

    set track = encodeAffyRnaHeLaSignal
    cat lab/RNA/wig/HeLa/EC_AS_HeLa_RCyP+_C01vsNULL.sig.wig \
    | wigEncode stdin wig/$track.wig wib/$track.wib
    ln -s `pwd`/wib/$track.wib $gbdbDir/wib/
    nice hgLoadWiggle hg17 $track wig/$track.wig -pathPrefix=$gbdbDir

cat > loadSig.csh << 'EOF'
    set gbdbDir = /gbdb/hg17/encode/Affy/2005-10-03
    foreach f (lab/RNA/wig/HL60/??/*HL60*C01vsNULL.sig.wig)
      set track = `echo $f:t:r:r:r | perl -wpe \
       's/EC_AS_HL60_RWP\+_RA_(\d+)hr_C01vsNULL/encodeAffyRnaHl60SignalHr$1/;'`
      echo $track
      cat $f \
      | wigEncode stdin wig/$track.wig wib/$track.wib
      ln -s `pwd`/wib/$track.wib $gbdbDir/wib/
      nice hgLoadWiggle hg17 $track wig/$track.wig -pathPrefix=$gbdbDir
    end
'EOF'
    csh loadSig.csh >&! loadSig.log
    # Create a single composite track for RNA and Transfrags

    # Update HeLa signal (2005-12-15 kate)
    cd /cluster/data/encode/Affy/2005-11-22
    mkdir wib wig
    set gbdbDir = /gbdb/hg17/encode/Affy/2005-10-03
    # set the track name before the rm that uses it, so the removal does not
    # depend on whatever $track happened to hold from an earlier step
    set track = encodeAffyRnaHeLaSignal
    # remove the old /gbdb symlink so the new .wib can be linked in its place
    rm $gbdbDir/wib/$track.wib
    cat lab/Affy_HeLa/wig/EC_AS_HeLa_RCyP+_C01vsNULL.sig.wig | \
        wigEncode stdin wig/$track.wig wib/$track.wib
            # Converted stdin, upper limit 1591.50, lower limit -779.75
    ln -s `pwd`/wib/$track.wib $gbdbDir/wib/
    nice hgLoadWiggle hg17 $track wig/$track.wig -pathPrefix=$gbdbDir

    # CHIP/Chip sites (2005-10-24 kate)
    cd /cluster/data/encode/Affy/2005-10-03
    # Load up 12 tables of ChIP/chip sites at (3 factors, 4 timepoints)
    # plus 2 more for ActD at 1 timepoint
    # loadChipBed.csh: for files laid out as lab/CHIP/bed/<factor>/<hr>/*.bed,
    # the factor is the grandparent directory name (with Pol2 and Hish4
    # renamed by sed) and the timepoint is the parent directory name; the
    # chr* lines go into encodeAffyChIpHl60SitesStrict<factor>Hr<hr>.
    # The two p63 directories have no timepoint level and are loaded by name.
cat > loadChipBed.csh << 'EOF'
    foreach f (lab/CHIP/bed/*/??/*.bed)
      set factor = `echo $f:h:h:t | sed 's/Pol2/Rnap/; s/Hish4/H4Kac4/'`
      set hr = $f:h:t
      set table = encodeAffyChIpHl60SitesStrict${factor}Hr$hr
      echo $table
      grep "^chr" $f | hgLoadBed -noBin hg17 $table stdin
    end
    grep "^chr" lab/CHIP/bed/p63_ActD/*.bed | hgLoadBed -noBin hg17 \
                encodeAffyChIpHl60SitesStrictP63_ActD stdin
    grep "^chr" lab/CHIP/bed/p63_mActD/*.bed | hgLoadBed -noBin hg17 \
                encodeAffyChIpHl60SitesStrictP63_mActD stdin
'EOF'
    # run the loader, capturing stdout and stderr in the log
    csh loadChipBed.csh >&! loadChipBed.log

    # Chip/chip signal and pvalue 
cat > loadChipWig.csh << 'EOF'
    set gbdbDir = /gbdb/hg17/encode/Affy/2005-10-03
    foreach d (lab/CHIP/wig/p63_ActD lab/CHIP/wig/p63_mActD)
        set factor = $d:t
        set prefix = encodeAffyChIpHl60;
        set track = ${prefix}SignalStrict$factor
        echo $track
        cat $d/*.sig.median.wig \
            | wigEncode stdin wig/$track.wig wib/$track.wib
        ln -s `pwd`/wib/$track.wib $gbdbDir/wib/
        nice hgLoadWiggle hg17 $track wig/$track.wig -pathPrefix=$gbdbDir
        set track = ${prefix}PvalStrict$factor
        echo $track
        cat $d/*.pval.median.wig \
            | wigEncode stdin wig/$track.wig wib/$track.wib
        ln -s `pwd`/wib/$track.wib $gbdbDir/wib/
        nice hgLoadWiggle hg17 $track wig/$track.wig -pathPrefix=$gbdbDir
    end
    foreach d (lab/CHIP/wig/*/??)
        set hr = $d:t
        set factor = $d:h:t
        set track = ${prefix}SignalStrict${factor}Hr$hr
        echo $track
        cat $d/*.sig.median.wig \
            | wigEncode stdin wig/$track.wig wib/$track.wib
        ln -s `pwd`/wib/$track.wib $gbdbDir/wib/
        nice hgLoadWiggle hg17 $track wig/$track.wig -pathPrefix=$gbdbDir
        set track = ${prefix}PvalStrict${factor}Hr$hr
        echo $track
        cat $d/*.pval.median.wig \
            | wigEncode stdin wig/$track.wig wib/$track.wib
        ln -s `pwd`/wib/$track.wib $gbdbDir/wib/
        nice hgLoadWiggle hg17 $track wig/$track.wig -pathPrefix=$gbdbDir
    end
'EOF'
    csh loadChipWig.csh >&! loadChipWig.log 
    # create 2 composite tracks:
    #  Affy Strict ChIP  (contains Oct freeze Sites and Pval subtracks)
    #  Affy Strict Sig  (contains Oct freeze Signal) 
    # and reformat June freeze tracks (2 composites w/ 10 factors each) as:
    #  Affy Loose ChIP  (contains Jun freeze Sites and Pval subtracks)

##########################################################################
# U North Carolina FAIRE  (2005-10-24 kate)
#       Peaks data updated (2006-04-13 kate and 2006-05-01, hartera)
# Added description for updated Peaks data - provided by 
# Paul Giresi (paulg@email.unc.edu) (2006-06-13, hartera)
# Finished data update for ChIPOTle peaks track and updated the downloads
# and changed the original Signal and Peaks subtracks to a BED graph
# so that data in tables looks more like the raw data (on request of 
# Paul Giresi) (DONE, 2006-08-16, hartera)
    # submitted by Paul Giresi, from Jason Lieb's lab
    # later, Paul submitted an "averages" file for the
    # raw data (but doesn't include the "peaks")
    # On 10/24, submitted peaks averages.
    # The averages files are:
    # FAIREavg_data.gff (for Signal, averages of all four replicates)
    # FAIREavg_peaks.gff (for Peaks, data after running peak-finding software
    # on the Signal averages data above).
    # Both of these files are in wiggle format. 
    cd /cluster/data/encode
    # -p: the intermediate directories may not exist yet
    mkdir -p UNC/2005-10-10/lab
    # stay in the submission directory: the loops below read lab/*.gff and
    # write into download/
    cd UNC/2005-10-10
    # copy files from FTP dir into lab/
    # 8 .gff data files plus description file
    # NOTE: these are actually .bed and .wig files
    # the .bed files are "peaks", and the .wig are "raw"
    # NOTE: these files are basically replicates,
    # we really want to show just the averages -- 
    # Submitter says OK to just post for download

    mkdir -p download
    # convert to UNIX format
    foreach f (lab/*norm*.gff)
        set t = $f:t:r
        echo $t
        dos2unix -n $f download/$t.bed
    end
    # slightly different format for "peaks" files
    foreach f (lab/*fpr01*.gff)
        set t = $f:t:r
        echo $t
        dos2unix -l -n $f download/$t.bed
    end
    cd download
    gzip *.bed
    # checksum the compressed files: after gzip no plain *.bed remain
    md5sum *.bed.gz > md5sum.txt
    # add README file with data terms
    # add README file with data terms

    ssh hgwdev
    # publish the download directory under the web server's datafiles area
    set downloadRoot = /usr/local/apache/htdocs/goldenPath/hg17/encode/datafiles
    mkdir -p ${downloadRoot}
    ln -s /cluster/data/encode/UNC/2005-10-10/download ${downloadRoot}/UncFaire

    # averages 
    # Probes are 50 bp with 12 bp overlap and the start of each spot on 
    # the chromosomes were listed. Changing the span to 38 bp removed the
    # overlap. This should only have been done for the Signal and not Peaks
    # data (from e-mail from Paul Giresi, 2006-08-08)  
    # The preceding steps end in download/ and a fresh login; the lab/ inputs
    # and the load script belong in the submission directory.
    cd /cluster/data/encode/UNC/2005-10-10
    sed 's/span=50/span=38/' lab/FAIREavg_data.gff > Signal.wig
    sed 's/span=50/span=38/' lab/FAIREavg_peaks.gff > Peaks.wig
    #_data.gff:  -2.61 to 3.63
    #_peaks.gff:   .47 to 3.63
    # using viewLimits: .2 to 2.6
    # wiggle0 with span=50
    # around 380K records, so load it wiggle, not bedGraph
    # load.csh: encode each file into encodeUncFaire<Signal|Peaks>.wig/.wib,
    # load the table with the /gbdb directory as path prefix, and symlink the
    # .wib there.  The /gbdb directory is created once, before the loop.
cat > load.csh << 'EOF'
    set dir = /gbdb/hg17/encode/UNC/2005-10-10
    mkdir -p $dir
    foreach f (Signal.wig Peaks.wig)
        set type = $f:r
        set table = encodeUncFaire$type
        wigEncode $f $table.wig $table.wib
        hgLoadWiggle -pathPrefix=$dir hg17 $table $table.wig
        ln -s `pwd`/$table.wib $dir
    end
'EOF'
    csh load.csh >&! load.log 

    # update peaks data (2006-04-13 kate)
    cd /cluster/data/encode
    # -p: creates the dated parent directory as well as lab/
    mkdir -p UNC/2006-04-13/lab
    # stay in the submission directory: the commands below read lab/... and
    # write peaks.bedGraph next to it
    cd UNC/2006-04-13
    # lab/OfficialChIPOTle_PEAKS.gff is a bedGraph format and this contains
    # the new peaks data after a data reanalysis.
    # lab/FAIREavg_OfficialPeaks.gff is a file in wiggle format with
    # the Signal track data first (the same as for the original track)
    # and then the new Peaks data.

    # trim precision for the peaks data:
    # (skips the first input line; prints chrom, start, end and the value
    # with three decimals, tab-separated)
    awk 'NR !=1 {printf("%s\t%d\t%d\t%.3f\n", $1, $2, $3, $4)}' \
        lab/OfficialChIPOTle_PEAKS.gff > peaks.bedGraph
    # data range: 0 - 3.627
    # load data as bedGraph (2006-05-01, hartera)
    # edit file and remove line: track    0       0       0.000
    # and then load
    hgLoadBed -strict -bedGraph=4 hg17 \
              encodeUncFairePeaksApr2006 peaks.bedGraph
    # added this as a new subtrack to human/trackDb.encode.ra to see what
    # it looks like. 
    # In ~/kent/src/hg/makeDb/trackDb/human/trackDb.encode.ra
    # add the following lines to the subtrack entry since it will inherit
    # from the parent track otherwise which is a wiggle type.
    # track encodeUncFairePeaksApr2006
    # subTrack encodeUncFaire
    # shortLabel UNC FAIRE Peaks Apr. 06
    # longLabel UNC FAIRE Peaks (Formaldehyde Assisted Isolation of Regulatory Elements) Apr. 2006 Update
    # noInherit on
    # type bedGraph 4
    # maxHeightPixels 128:16:16
    # autoScale off
    # windowingFunction mean
    # viewLimits .2:2.6
    # color 20,150,20
    # altColor 50,100,50
    # priority 3
    # Description update added (2006-06-13, hartera).
    # Add new description for new Peaks data subtrack from 
    # FAIRE_peaks_DESC.htm to trackDb/human/encodeUncFaire.html 
    # Data is not correct so reload the Peaks data sent by Paul Giresi
    # FAIRE_peaks1e-025_feat_track.gff in lab directory
    ssh hgwdev
    mkdir -p /cluster/data/encode/UNC/2006-05/lab
    cd /cluster/data/encode/UNC/2006-05
    # remove first line ("tail -n +2" replaces the obsolete "tail +2"
    # spelling, which newer tail implementations reject)
    tail -n +2 lab/FAIRE_peaks1e-025_feat_track.gff > peaks.bedGraph
    # and then load as bedGraph
    hgLoadBed -strict -bedGraph=4 hg17 \
              encodeUncFairePeaksChipotle peaks.bedGraph
    # edit human/trackDb.encode.ra entry above to use May2006 table.
    # edit these lines too:
    # track encodeUncFairePeaksChipotle
    # longLabel UNC FAIRE Peaks (Formaldehyde Assisted Isolation of Regulatory
    # Elements) (ChIPOTle)
    # viewLimits 0.0:2.7
    # color 0,0,255

    # Reload the original Peaks and Signal data as BED graph so that the 
    # table downloads look more like the original data so that the number
    # of lines in the table is the same as the number of peaks for the Peaks
    # track. Used original data with span=50. This is the size of the probes.
    cd /cluster/data/encode/UNC/2005-10-10
    mkdir -p bedGraphFormat
    cd bedGraphFormat
    /cluster/bin/scripts/varStepToBedGraph.pl ../lab/FAIREavg_data.gff \
           > signal.bedGraph
    # Processed 385194 lines input, 385149 data lines, 44 variable step
    # declarations 
    /cluster/bin/scripts/varStepToBedGraph.pl ../lab/FAIREavg_peaks.gff \
           > peaksOriginal.bedGraph
    # Processed 845 lines input, 800 data lines, 44 variable step declarations
    # Reload the Signals and Peaks tables with this data.
    hgsql -e "drop table encodeUncFaireSignal;" hg17
    hgsql -e "drop table encodeUncFairePeaks;" hg17
    
    hgLoadBed -strict -bedGraph=4 hg17 \
              encodeUncFaireSignal signal.bedGraph
    hgLoadBed -strict -bedGraph=4 hg17 \
              encodeUncFairePeaks peaksOriginal.bedGraph
    # update human/trackDb.encode.ra so that type is
    # type bedGraph 4
    # for the parent track.
    cd /cluster/data/encode/UNC/2005-10-10/lab
    cp FAIREavg_data.gff ../download/FAIREavg_data.wig
    cp FAIREavg_peaks.gff ../download/FAIREavg_peaks.wig
    cd ../download
    gzip *.wig
    # change the Signal files to wig extension as these are wiggle format
    # $f:r:r strips the last two extensions from the file name (.gz, then
    # .bed), so name.bed.gz is renamed to name.wig.gz
    foreach f (*CHR.bed.gz)
       set g=$f:r:r
       mv $f ${g}.wig.gz
    end
    # Add ChIPOTle data to downloads. 
    cp /cluster/data/encode/UNC/2006-05/lab/FAIRE_peaks1e-025_feat_track.gff \
       FAIRE_peaks1e-025_feat_track.bed
    gzip FAIRE_peaks1e-025_feat_track.bed 
    # Add description of these files to README.txt and update the md5sum.txt
    rm md5sum.txt
    md5sum *.gz > md5sum.txt
   
    # Look at data using histogram to help decide viewLimit:
    cd /cluster/data/encode/UNC/2005-10-10/bedGraphFormat
    textHistogram -binSize=0.1 -maxBinCount=65 -col=4 -minVal=-2.7 -real \
                  signal.bedGraph > signal.hist
    textHistogram -binSize=0.1 -maxBinCount=40 -col=4 -real \
                  peaksOriginal.bedGraph > peaksOriginal.hist
    textHistogram -binSize=0.1 -maxBinCount=40 -col=4 -real \
                  ../../2006-05/peaks.bedGraph > peaksChipotle.hist
    # set minLimit, maxLimit, viewLimit and increased default pixel size for 
    # subtrack so that y axis scale is shown, in human/trackDb.encode.ra:
    # maxHeightPixels 128:24:16
    # minLimit -2.61
    # maxLimit 3.63
    # viewLimits -0.6:0.7
    # for the Peaks subtracks, the viewLimits were set as:
    # viewLimits 0.4:3.7

##########################################################################
# Gencode Genes (2005-10-10 kate)
#    Files are on Gencode/IMIM web site, our contact for this round is France Denoed
#    France requested 3 subtracks: genes, putatives, and pseudogenes        
# NOTE: reloaded encodeGencodeKnown from updated _genes_ file 10/14 (kate)

# Update 2007-03-28 (Kate). Received from Julien Lagarde (jlagarde@imim.es)
# Julien requested 5 subtracks: reference genes, putative, polymorphic,
# pseudogenes, polyA features. Track update (DONE, 2007-04-14, hartera)
# Renamed the gencodeGeneClassOct05 table to encodeGencodeGeneClassOct05
# (DONE, 2007-09-09, hartera)
    cd /cluster/data/encode
    mkdir -p Gencode/2005-10-07/lab
    cd Gencode/2005-10-07/lab

    wget ftp://genome.imim.es/pub/other/gencode/data/havana-encode/current/44regions/README
    wget ftp://genome.imim.es/pub/other/gencode/data/havana-encode/current/44regions/44regions_genes_CHR_coord.gtf
    wget ftp://genome.imim.es/pub/other/gencode/data/havana-encode/current/44regions/44regions_putative_CHR_coord.gtf
    wget ftp://genome.imim.es/pub/other/gencode/data/havana-encode/current/44regions/44regions_pseudogenes_CHR_coord.gtf

    cd ..
    ldHgGene -gtf -genePredExt hg17 encodeGencodeKnown \
        lab/44regions_genes_CHR_coord.gtf

            # Read 2637 transcripts in 45565 lines in 1 files
            # 2637 groups 21 seqs 13 sources 5 feature types
            # 2608 gene predictions
    genePredCheck -db=hg17 encodeGencodeKnown

    ldHgGene -gtf -genePredExt hg17 encodeGencodePutative lab/44regions_putative_CHR_coord.gtf
            # 156 gene predictions
    genePredCheck -db=hg17 encodeGencodePutative

    # load the pseudogenes subtrack and validate the table just loaded
    ldHgGene -gtf -genePredExt hg17 encodeGencodePseudo lab/44regions_pseudogenes_CHR_coord.gtf
            # 197 gene predictions
    genePredCheck -db=hg17 encodeGencodePseudo
    # create composite track: "Gencode Oct Gene" with 3 subtracks

    # Introns track
    grep intron lab/*.gtf | wc -l
        # 25421
    # ignore "not tested" introns
    grep intron lab/*.gtf | grep -v not_tested | wc -l
        # 483
    # NOTE: need version of loader with new status value added
    cat lab/*.gtf | grep -v not_tested | sed -e 's/-intron/-/g' | \
        ~/bin/i386/ldGencodeIntron hg17 encodeGencodeIntronOct stdin
            # 483 introns in 1 files

    # create gene class table
    # The table definition is taken from gencodeGeneClass.sql with the table
    # name changed to gencodeGeneClassOct.
    # NOTE(review): the rename step further down refers to
    # gencodeGeneClassOct05, not gencodeGeneClassOct -- confirm which name
    # the table actually had at that point.
    sed 's/gencodeGeneClass/gencodeGeneClassOct/' \
        ~/kent/src/hg/lib/gencodeGeneClass.sql | hgsql hg17
    # Build a two-column tab file from the VEGA lines of the GTF files:
    # whitespace-delimited field 10 (quotes and ';' removed) and field 2
    # (with the VEGA_ prefix removed, _val rewritten to _gencode_conf and
    # Antisense rewritten to Novel_transcript); duplicates are dropped.
    cat lab/*.gtf | grep VEGA | \
        awk '{printf "%s\t%s\n", $10, $2}' | \
        sed -e 's/"//g' -e 's/;//' -e 's/VEGA_//' \
            -e 's/_val/_gencode_conf/' -e 's/Antisense/Novel_transcript/' | \
        sort | uniq > gencodeGeneClassOct.tab
    wc -l gencodeGeneClassOct.tab
        #  2961
    # load the tab file into the table created above
    echo "LOAD DATA LOCAL INFILE 'gencodeGeneClassOct.tab' into table gencodeGeneClassOct" | hgsql hg17

    # Rename the gencodeGeneClassOct05 table so that it has the prefix
    # "encode" in line with all other ENCODE tables. (2007-09-09, hartera)
    hgsql -e \
   'alter table gencodeGeneClassOct05 rename encodeGencodeGeneClassOct05;' hg17
    # Make the change in trackDb/human/trackDb.encode.ra so that the
    # itemClassTbl is encodeGencodeGeneClassOct05 for Gencode Genes Oct05.

    ######################################################################
    # Update 2007-03-28 from Julien Lagarde <jlagarde@imim.es>(Kate)
    # Track update with five subtracks (in progress, 2007-04-10, hartera).
    # Gencode reference genes: "Known" and "Novel_CDS"
    # Gencode putative: "Novel_Transcript", "Putative", "TEC", "Artifact"
    # Gencode polymorphic: "Polymorphic"
    # Gencode pseudogenes: "Processed_pseudogene", "Unprocessed_pseudogene"
    # Gencode polyA features: "polyA_signal", "polyA_site", "pseudo_polyA"
    # cut -f2 *chr_coords_hg17.gff | sort | uniq shows all the types.
    # New description sent 2007-04-12, ucsc_description2.html
    # Update finished 2007-04-14, hartera.
    # Renamed the gencodeGeneClassMar07 table to encodeGencodeGeneClassMar07
    # (DONE, 2007-09-09, hartera)
    cd /cluster/data/encode/Gencode
    mkdir -p 2007-03-28/lab
    ln -s 2007-03-28 latest
    cd 2007-03-28/lab
    wget -r -nv -nd -np ftp://genome.imim.es/pub/projects/gencode/data/havana-encode/current/gff/EN\*hg17.gff
    cd /cluster/data/encode/Gencode/2007-03-28
    egrep -h 'Known|Novel_CDS' lab/*.gff > encodeGencodeGeneKnownMar07.gff
    egrep -h 'Novel_Transcript|Putative|TEC|Artifact' lab/*.gff \
          > encodeGencodeGenePutativeMar07.gff
    egrep -h 'Polymorphic' lab/*.gff > encodeGencodeGenePolymorphicMar07.gff
    egrep -h 'pseudogene' lab/*.gff > encodeGencodeGenePseudoMar07.gff
    egrep -h 'polyA' lab/*.gff | \
          awk 'BEGIN {FS="\t"} {OFS="\t"} {print $1, $4, $5, $2, "0", $7;}' | \
          sed -e 's/VEGA_//' \
          > encodeGencodeGenePolyA.bed
    wc -l encode*
    # total is 33492 which is the same number of lines in the lab/*.gff files.
    
    # load these into the database:
    foreach c (Known Putative Polymorphic Pseudo)
        set table = encodeGencodeGene${c}Mar07
        echo $table
        ldHgGene -genePredExt hg17 $table ${table}.gff             
        genePredCheck -db=hg17 $table
    end
    # encodeGencodeGeneKnownMar07 - 2991 gene predictions
    # encodeGencodeGenePutativeMar07 - 372 gene predictions
    # encodeGencodeGenePolymorphicMar07 - 25 gene predictions
    # encodeGencodeGenePseudoMar07 - 191 gene predictions

    # everything looks fine with genePredCheck. A bug was fixed by Mark
    # in ldHgGene since there was an error with loading initially. Tables
    # were reloaded on 2007-04-12.
 
    # The BED table can have an itemRgb column to specify coloring for items.
    # polyA_signal: brown, polyA_site: orange, pseudo_polyA: pink
    # Add the colours in r,g,b format in column 9:
    awk 'BEGIN {OFS="\t"} {if ($4 ~ /signal/) print $0, "0", "0", "94,38,5"; \
        else if ($4 ~ /site/) print $0, "0", "0", "255,102,0"; \
        else if ($4 ~ /pseudo/) print $0, "0", "0", "255,153,255";}' \
        encodeGencodeGenePolyA.bed > encodeGencodeGenePolyAMar07.bed 
    # Load in the polyA features BED file:
    hgLoadBed hg17 encodeGencodeGenePolyAMar07 encodeGencodeGenePolyAMar07.bed
    # Loaded 1807 elements of size 6

    # Create the gene class table:
    # modify table name and add Polymorphic class to enum:
    sed -e 's/gencodeGeneClass/gencodeGeneClassMar07/' \
        ~/kent/src/hg/lib/gencodeGeneClass.sql \
        > gencodeGeneClassMar07.sql
    perl -pi.bak -e \
    "s/Unprocessed_pseudogene\'/Unprocessed_pseudogene\', \'Polymorphic\'/" \
         gencodeGeneClassMar07.sql
    rm *.bak

    cat lab/*.gff | grep VEGA | grep -v polyA | \
        awk '{printf "%s\t%s\n", $10, $2}' | \
        sed -e 's/"//g' -e 's/;//' -e 's/VEGA_//' | \
        sort | uniq > gencodeGeneClassMar07.tab
    wc -l gencodeGeneClassMar07.tab
    # 3579 gencodeGeneClassMar07.tab
    # load into database:
    hgLoadSqlTab hg17 gencodeGeneClassMar07 gencodeGeneClassMar07.sql \
                 gencodeGeneClassMar07.tab

    # Create a human/trackDb.encode.ra entry and the description page.
    cp -p ucsc_description.html \
       ~/kent/src/hg/makeDb/trackDb/human/encodeGencodeGeneMar07.html 

    # E-mailed Julien Lagarde on 2007-04-13 to ask if a Gencode Introns
    # track update is also required. E-mail from Julien on 2007-04-16 states
    # that the old Gencode Introns track represented RT-PCR verification of 
    # individual exon junctions. This information has now been integrated into gene
    # objects by HAVANA annotators so the Introns track is now obsolete -
    # this is also stated in the track description.

    # Rename the gencodeGeneClassMar07 table so that it has the prefix
    # "encode" in line with all other ENCODE tables. (2007-09-09, hartera)
    hgsql -e \
   'alter table gencodeGeneClassMar07 rename encodeGencodeGeneClassMar07;' hg17
    # Make the change in trackDb/human/trackDb.encode.ra so that the
    # itemClassTbl is encodeGencodeGeneClassMar07 for Gencode Genes Mar07.

##########################################################################
# NHGRI DNaseI HS (2005-10-24 kate)
#       Submitter: Greg Crawford
#       2 datasets:  CD4, GM06690 with different methodology from previous
#       Additional (raw) data for both cell types submitted 12/6/05

# Additional data submitted 8/10/06: Raw & Pval for HelaS3 and GM cells
# Submitted new PVAL (3 cell lines) data on 8/11
# Additional data submission from Greg (at Duke now) 9/22/06  -- HepG2 cell line
# Additional data (DNAse array )for IMR90, K562 and H9 cells 
#       (raw and pvalue) submitted 2/26/07
# Track update to add subtracks for IMR90, K562 and H9 cells (Raw and Pval for 
# DNase-chip method). Old Method subtracks (for GM06990 and CD4+ T cells) were 
# removed and any references to them in the description were also removed.
# (DONE, 2007-04-10, hartera)
    cd /cluster/data/encode/NHGRI/crawford
    mkdir -p 2005-10-11/lab
    cd 2005-10-11/lab
    # copy 2 data files from FTP site
    # go back up to the submission directory: every path used below
    # (lab/..., *.hg16.bed, load.csh) is relative to 2005-10-11, not to lab
    cd ..

    # lift to hg17
    # the CD4 file is used as-is via a symlink; for GM06990 only columns
    # 1-4 and 6 are kept
    ln -s lab/Crawford_DNase_chip_CD4_hg16.txt Cd4.hg16.bed
    awk '{printf "%s\t%s\t%s\t%s\t%s\n", $1,$2,$3,$4,$6}' \
        lab/Crawford_DNase_chip_GM06990_hg16.txt > Gm06990.hg16.bed
    # oops - mistakenly deleted lab/Crawford*txt files
cat > load.csh << 'EOF'
    foreach f (Cd4.hg16.bed Gm06990.hg16.bed)
        set cell = $f:r:r
        liftOver $cell.hg16.bed \
                 /cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain \
                        $cell.hg17.bed $cell.unmapped
        hgLoadBed hg17 encodeNhgriDnaseHsChip$cell $cell.hg17.bed
    end
'EOF'
    csh load.csh >&! load.log

    # Add these two tracks to the hg17 NHGRI DNase track
    # Rename 2 data tables lifted from hg16
    hgsql hg17 -e "ALTER TABLE encodeNhgriDnaseHsAct RENAME TO encodeNhgriDnaseHsMpssCd4Act"
    hgsql hg17 -e "ALTER TABLE encodeNhgriDnaseHsNonAct RENAME TO encodeNhgriDnaseHsMpssCd4"

    # Raw data
    ln -s lab/NHGRI_DNase_chip_CD4_na_RAW.bed Cd4.raw.hg16.bed
    ln -s lab/NHGRI_DNase_chip_GM_RAW.bed  Gm06990.raw.hg16.bed
cat > loadRaw.csh << 'EOF'
    foreach f (Cd4.raw.hg16.bed Gm06990.raw.hg16.bed)
        set cell = $f:r:r:r
        liftOver $f \
                 /cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain \
                        $cell.raw.hg17.bed $cell.raw.unmapped
        hgLoadBed -strict -bedGraph=4 hg17 \
                encodeNhgriDnaseHsChipRaw$cell $cell.raw.hg17.bed
    end
'EOF'
    csh loadRaw.csh >&! loadRaw.log
    # Loaded 382713 elements of size 4

    # 8/10/06 Data submission
    cd /cluster/data/encode/NHGRI/crawford
    mkdir -p 2006-08-10/lab
    cd 2006-08-10/lab
    # copy the submitted files from the FTP area into lab/
    cp -p /var/ftp/encode/Crawford* .
    ls
# Crawford_DNase-chip_GM06990_PVAL_hg17.bed
# Crawford_DNase-chip_GM06990_RAW_HG17.bed
# Crawford_DNase-chip_HeLaS3_PVAL_HG17.bed
# Crawford_DNase-chip_HeLaS3_RAW_hg17.bed

    # go back up to the submission directory: the commands below read
    # lab/... and write their output next to lab/
    cd ..

    # NOTE: the GM Raw data is identical to that submitted on 10/11/05
    set cell = Gm06990
    awk '{print $1, $2, $3, $5}' lab/Crawford_DNase-chip_GM06990_RAW_HG17.bed \
        > $cell.raw.hg17.bed
    hgLoadBed -strict -bedGraph=4 hg17 \
                encodeNhgriDnaseHsChipRaw$cell $cell.raw.hg17.bed
    # Loaded 382713 elements of size 4
    set cell = Hela
    awk '{print $1, $2, $3, $5}' lab/Crawford_DNase-chip_HeLaS3_RAW_hg17.bed \
        > $cell.raw.hg17.bed
    hgLoadBed -strict -bedGraph=4 hg17 \
                encodeNhgriDnaseHsChipRaw$cell $cell.raw.hg17.bed
    # Loaded 385149 elements of size 4
    # Note: different item count from data for CD4 and GM

# Submitted new PVAL data on 8/11
#Crawford_DNase-chip_CD4_PVAL_hg17.bed10
#Crawford_DNase-chip_GM06990_PVAL_hg17.bed10
#Crawford_DNase-chip_HeLaS3_PVAL_hg17.bed12

  ln -s lab/Crawford_DNase-chip_GM06990_PVAL_hg17.bed10 Gm06990.pval.hg17.bed
  ln -s lab/Crawford_DNase-chip_HeLaS3_PVAL_hg17.bed12 Hela.pval.hg17.bed
  ln -s lab/Crawford_DNase-chip_CD4_PVAL_hg17.bed10 Cd4.pval.hg17.bed

# load Pval data as bed5floatscore, with pval mapped to integer score (0-1000)
# for display purposes
# format: chr start end name score pVal
# The generated loadPval.csh does the following for each cell type:
#   - lowercases the cell name (sed \L) for use in item names
#   - keeps only input lines starting with "chr" and writes six columns:
#     chrom, start, end, <lcell>_<NR-1>, score, pVal, where
#     score = column 5 * 35 + 100 (printed with %d) and pVal = column 5
#     printed with 3 decimals
#     (NR-1 presumably compensates for a leading non-data line -- confirm)
#   - creates the table from bed5Pval.sql with the table name substituted,
#     dropping any existing table first
#   - loads the data with hgLoadBed and runs checkTableCoords
cat > loadPval.csh << 'EOF'
    foreach cell (Cd4 Gm06990 Hela)
        set lcell = `echo $cell | sed 's/\(.*\)/\L\1/'`
        awk -v CELL=$lcell '/^chr/ {printf("%s\t%d\t%d\t%s_%d\t%d\t%.3f\n", $1, $2, $3, CELL, NR-1, $5 * 35 + 100, $5)}' $cell.pval.hg17.bed > $cell.pval.bed5+
        set table = encodeNhgriDnaseHsChipPval$cell
        sed "s/bed5Pval/$table/" ~/kent/src/hg/lib/bed5Pval.sql > \
                $table.sql
        hgsql hg17 -e "DROP TABLE IF EXISTS $table"
        hgsql hg17 < $table.sql
        hgLoadBed -strict -sqlTable=$table.sql hg17 \
                $table $cell.pval.bed5+
        checkTableCoords hg17 $table
        end
'EOF'
    csh loadPval.csh >&! loadPval.log 
# min = 3.13 max  = 24.513
    # Reading Cd4.pval.bed5+
    #Loaded 1262 elements of size 6
    #Reading Gm06990.pval.bed5+
    #Loaded 1098 elements of size 6
    #Reading Hela.pval.bed5+
    #Loaded 1042 elements of size 6

# data submission 9/22/06  -- HepG2 cell line
    cd /cluster/data/encode/NHGRI/crawford
    mkdir -p 2006-09-22/lab
    cd 2006-09-22
    ln -s lab/Crawford_DNase-chip_HepG2_PVAL_HG17.bed HepG2.pval.bed
    ln -s lab/Crawford_DNase-chip_HepG2_RAW_hg17.bed HepG2.raw.bed

    set cell = HepG2

    # load RAW
    awk '{print $1, $2, $3, $5}' $cell.raw.bed | \
                hgLoadBed -strict -bedGraph=4 hg17 \
                        encodeNhgriDnaseHsChipRaw$cell stdin
        # Loaded 385149 elements

    # load PVAL
    set lcell = hepg2
    awk -v CELL=$lcell '/^chr/ {printf("%s\t%d\t%d\t%s_%d\t%d\t%.3f\n", $1, $2, $3, CELL, NR, $5 * 35 + 100, $5)}' $cell.pval.bed > $cell.pval.bed5+
    set table = encodeNhgriDnaseHsChipPval$cell
    sed "s/bed5Pval/$table/" ~/kent/src/hg/lib/bed5Pval.sql > \
            $table.sql
    hgsql hg17 -e "DROP TABLE IF EXISTS $table"
    hgsql hg17 < $table.sql
    hgLoadBed -strict -sqlTable=$table.sql hg17 \
            $table $cell.pval.bed5+
    checkTableCoords hg17 $table

# Additional data (DNAse array )for IMR90, K562 and H9 cells 
#       (raw and pvalue) submitted 2/26/07

    cd /cluster/data/encode
    ln -s NHGRI/crawford Duke
    cd Duke
    mkdir -p 2007-02-26/lab

    # copy files from FTP area
    cd 2007-02-26
    # (DONE, 2007-04-03, hartera)
    ln -s lab/Crawford_DNase-chip_H9_pvalue.bed H9.pval.bed
    ln -s lab/Crawford_DNase-chip_H9_RAW_HG17.bed H9.raw.bed
    ln -s lab/Crawford_DNase-chip_IMR90_pvalue.bed Imr90.pval.bed
    ln -s lab/Crawford_DNase-chip_IMR90_RAW_HG17.bed Imr90.raw.bed
    ln -s lab/Crawford_DNase-chip_K562_pvalue.bed K562.pval.bed
    ln -s lab/Crawford_DNase-chip_K562_RAW_HG17.bed K562.raw.bed

    # load RAW (-strict is now default for hgLoadBed)
    foreach cell (H9 Imr90 K562)
        awk '{print $1, $2, $3, $5}' $cell.raw.bed | \
                hgLoadBed -bedGraph=4 hg17 \
                        encodeNhgriDnaseHsChipRaw$cell stdin
    end
    # check table coordinates for raw data:
     foreach cell (H9 Imr90 K562)
         set table = encodeNhgriDnaseHsChipRaw$cell
         echo $table
         checkTableCoords hg17 $table
     end
  
    # load PVAL
    foreach cell (H9 Imr90 K562)
        set lcell = `echo $cell | sed 's/\(.*\)/\l\1/'`
        awk -v CELL=$lcell '/^chr/ {printf("%s\t%d\t%d\t%s_%d\t%d\t%.3f\n", $1, $2, $3, CELL, NR, $5 * 35 + 100, $5)}' $cell.pval.bed > $cell.pval.bed5+
        set table = encodeNhgriDnaseHsChipPval$cell
        sed "s/bed5Pval/$table/" ~/kent/src/hg/lib/bed5Pval.sql > \
                $table.sql
        hgsql hg17 -e "DROP TABLE IF EXISTS $table"
        hgsql hg17 < $table.sql
        hgLoadBed -sqlTable=$table.sql hg17 $table $cell.pval.bed5+
        echo "Checking coords in $table"
        checkTableCoords hg17 $table
    end
    # Added the new subtracks to the hg17/trackDb.encode.ra and updated the
    # description.
    # (2007-04-10, hartera)
    # GM06990 and CD4+ T cell subtracks done by the old method were removed from
    # trackDb/human/hg17/trackDb.encode.ra and also any references to them 
    # in the description. Tables for these subtracks are: 
    # encodeNhgriDnaseHsChipGm06990 and encodeNhgriDnaseHsChipCd4.

##########################################################################
# Sanger Chip/chip Hits and Centers (2005-10-24 kate)
# From Paul Flicek, at EBI 
# 14 files (3 cells, most with 5 factors), each file having
    # 3 tracks:  chip/chip, HMM regions, HMM centers
    # Christoph says to just display the HMM regions & centers tracks
    # from Paul's files. These were generated from June freeze
    # chip/chip, plus newly submitted HeLa data from Christoph Koch (10/7).
    cd /cluster/data/encode/sanger/chipchip
    mkdir -p 2005-10-18/lab
    # work from the submission directory, not from lab/: the load scripts
    # below glob lab/*.wig.txt and write their output next to lab/
    cd 2005-10-18

    # HeLa chip/chip
cat > loadHela.csh << 'EOF'
    foreach f (lab/*_HeLa-S3_1.wig.txt)
        set b = `echo $f:t:r:r | sed 's/-S3_1//; s/_//'`
        echo $b
        grep "^chr" $f | sort -k1,1 -k2,2n > chip.$b.wig
        hgLoadBed -bedGraph=4 hg17 encodeSangerChip$b chip.$b.wig 
    end
'EOF'
    csh loadHela.csh >&! loadHela.log &

    # split HMM tracks out of files
cat > load.csh << 'EOF'
    foreach f (lab/*.split.wig.txt)
        set b = `echo $f:t:r:r:r | sed 's/-2//; s/-//g'`
        echo $b
        /cluster/data/encode/bin/scripts/splitTracks.pl $f
        rm t0
        grep '^chr' t1 | sort -k1,1 -k2,2n > $b.wig; rm t1
        hgLoadBed -bedGraph=4 hg17 encodeSangerChipHit$b $b.wig
        checkTableCoords hg17 encodeSangerChipHit$b
        grep '^chr' t2 | sed 's/	1$//' > $b.bed; rm t2
        hgLoadBed -noBin hg17 encodeSangerChipCenter$b $b.bed
        checkTableCoords hg17 encodeSangerChipCenter$b
    end
'EOF'
    csh load.csh >&! load.log 
    
#############################################################################
#  Measuring TARs and TransFrags distances to SINEs and LINEs
#
#
#	Using the table browser on genome.ucsc.edu on Hg17, select the
#	Alu SINEs and L1,L2 LINEs by setting filter at:
#	repClass=LINE or SINE
#	repFamily=L1, L2 or Alu
#	request fields: swScore, genoName, genoStart, genoEnd, repNames
#	save to file L1_LINE_Hg17.txt.gz, L2_LINE_Hg17.txt.gz
#	Alu_SINE_Hg17.txt.gz
##########################################################################
# UW/Regulome DnaseI HS (2005-10-28, 11-17 kate)
# NOTE: trimmed overlaps in baseline files, as per Scott Kuehn

    cd /cluster/data/encode/Regulome
    mkdir -p 2005-11-16
    cd 2005-11-16
# The generated load.csh loads five tables per cell line from lab/:
#   encodeRegulomeQuality<cell>   from <cell>.qc.bed
#   encodeRegulomeAmplOdd<cell>   from <cell>.oddAmps.bed
#   encodeRegulomeAmplEven<cell>  from <cell>.evenAmps.bed
#   encodeRegulomeProb<cell>      from <cell>.hs.bed        (-bedGraph=5)
#   encodeRegulomeBase<cell>      from <cell>.baseline.bed  (-bedGraph=5),
#       sorted by chrom/start and passed through trimOverlap.pl first,
#       then loaded with -noSort
# All loads use -noBin -strict.  The script is run in the background with
# stdout and stderr captured in load.log.
cat > load.csh << 'EOF'
    foreach cell (CACO2 CD34 GM HeLa HepG2 Huh7 K562 SKNSH)
        echo $cell
        hgLoadBed -noBin -strict hg17 \
            encodeRegulomeQuality$cell lab/$cell.qc.bed
        hgLoadBed -noBin -strict hg17 \
            encodeRegulomeAmplOdd$cell lab/$cell.oddAmps.bed
        hgLoadBed -noBin -strict hg17 \
            encodeRegulomeAmplEven$cell lab/$cell.evenAmps.bed
        hgLoadBed -noBin -strict -bedGraph=5 hg17 \
            encodeRegulomeProb$cell lab/$cell.hs.bed
        sort -k1,1 -k2,2n lab/$cell.baseline.bed | \
            /cluster/data/encode/bin/scripts/trimOverlap.pl | \
            hgLoadBed -noSort -noBin -strict -bedGraph=5 hg17 \
                encodeRegulomeBase$cell stdin
    end
'EOF'
    csh load.csh >&! load.log &

##########################################################################
# UC Davis Chip/chip (new C-Myc data) (2005-10-29 kate)
# Add as subtrack to existing track
# New datafiles for hits (c-Myc and E2F1) submitted 2006-10-24
# by Mark Bieda <mcbieda@ucdavis.edu>

    # convert to bedGraph
    cd /cluster/data/encode/UcDavis/2005-10-12
    set table = encodeUCDavisChipMyc
    awk '{printf "%s\t%s\t%s\t%s\n", $1,$4,$5,$6}' lab/myc_median.gff | \
	sort -k1,1 -k2,2n > $table.bed
    hgLoadBed -strict -bedGraph=4 hg17 $table $table.bed
        # Loaded 385149 elements

    # hits data
    # 2 files: E2F1_HelaFIGS_T02P0001S50G2CHR.gff  myc_helafix_hg17_T02P0001S50G2CHR.gff
    # NOTE: E2F1 data submitted in hg16 coords
    # Load as bed 5 -- generating item names from <chr>_<start>
    #  at recommendation of Mark Bieda
    cd /cluster/data/encode/UcDavis
    mkdir -p 2006-10-24/lab
    # copy files from FTP dir
    cd 2006-10-24
    awk '{printf "%s\t%d\t%d\t%s_%s\t%d\n", $1,$4,$5,$1,$4,$6}' \
        lab/E2F1_HelaFIGS_T02P0001S50G2CHR.gff > e2f1.hg16.bed
    liftOver e2f1.hg16.bed \
        /cluster/data/encode/convertHg17/hg16ToHg17.over.chain.gz \
                e2f1.hg17.bed e2f1.unmapped
    # 1 unmapped (in ENm006)
    #  chrX    152137876       152138192       chrX_152137876  2
    hgLoadBed -strict hg17 encodeUcDavisChipHitsE2F1 e2f1.hg17.bed
        # Loaded 204 elements of size 5 
    checkTableCoords hg17 encodeUcDavisChipHitsE2F1

    awk '{printf "%s\t%d\t%d\t%s_%s\t%d\n", $1,$4,$5,$1,$4,$6}' \
        lab/myc_helafix_hg17_T02P0001S50G2CHR.gff > myc.hg17.bed
    hgLoadBed -strict hg17 encodeUcDavisChipHitsMyc myc.hg17.bed
        # Loaded 172 elements of size 5
    checkTableCoords hg17 encodeUcDavisChipHitsMyc

    # NOTE: drop old tables after review

##########################################################################
# UC Davis Chip/chip (new data, PolII and Taf in GM and HelaS3 cells) 
# (2007-05-02 ting)
# Add as subtrack to existing track
# New datafiles submitted 2007-3-23
# by Mark Bieda <mcbieda@ucdavis.edu>
# 
# 

    # convert to bedGraph
    cd /cluster/data/encode/UcDavis/2007-03-23
    
    awk '{printf "%s\t%d\t%d\t%f\n", $1,$4,$5,$6}' lab/GM_POLII_qmed_m3.gff \
	  > encodeUCDavisPolII_GM.bed
    awk '{printf "%s\t%d\t%d\t%f\n", $1,$4,$5,$6}' lab/GM_Taf_qmed_m3.gff \
	  > encodeUCDavisTaf_GM.bed
    awk '{printf "%s\t%d\t%d\t%f\n", $1,$4,$5,$6}' lab/HelaS3_POL_qmed_m3.gff \
	  > encodeUCDavisPolII_HelaS3.bed
    awk '{printf "%s\t%d\t%d\t%f\n", $1,$4,$5,$6}' lab/HelaS3_Taf_qmed_m3.gff \
	  > encodeUCDavisTaf_HelaS3.bed
	      
    hgLoadBed -bedGraph=4 hg17 encodeUCDavisPolII_GM \
	  encodeUCDavisPolII_GM.bed
	  # Loaded 385149 elements of size 4
    # Sorted
    # Creating table definition for encodeUCDavisPolII_GM
	
    hgLoadBed -bedGraph=4 hg17 encodeUCDavisTaf_GM \
	  encodeUCDavisTaf_GM.bed
    # Loaded 385149 elements of size 4
    # Sorted
    # Creating table definition for encodeUCDavisTaf_GM

    hgLoadBed -bedGraph=4 hg17 encodeUCDavisPolII_HelaS3 \
	  encodeUCDavisPolII_HelaS3.bed
	  # Loaded 385149 elements of size 4
    # Sorted
    # Creating table definition for encodeUCDavisPolII_HelaS3

    hgLoadBed -bedGraph=4 hg17 encodeUCDavisTaf_HelaS3 \
	  encodeUCDavisTaf_HelaS3.bed
	  # Loaded 385149 elements of size 4
    # Sorted
    # Creating table definition for encodeUCDavisTaf_HelaS3
	  
	  checkTableCoords hg17 encodeUCDavisPolII_GM
	  checkTableCoords hg17 encodeUCDavisPolII_HelaS3
	  checkTableCoords hg17 encodeUCDavisTaf_GM
	  checkTableCoords hg17 encodeUCDavisTaf_HelaS3

# Note: currently these 4 tables are subtracks under encodeUCDavisChip

# Change table names since we don't want "_" there. --ting, 062707
    hgsql hg17 -e "ALTER TABLE encodeUCDavisPolII_GM RENAME TO encodeUCDavisPolIIGM;"
    hgsql hg17 -e "ALTER TABLE encodeUCDavisPolII_HelaS3 RENAME TO encodeUCDavisPolIIHelaS3;"
    hgsql hg17 -e "ALTER TABLE encodeUCDavisTaf_GM RENAME TO encodeUCDavisTafGM;"
    hgsql hg17 -e "ALTER TABLE encodeUCDavisTaf_HelaS3 RENAME TO encodeUCDavisTafHelaS3;"

# Release note, --ting, 072407
# encodeUCDavisChip track was replaced by encodeUcDavisChipHits before Nature paper publication.
# With new raw data submitted, it was decided to release the encodeUCDavisChip track with both
# old data and new data. 
# Note: Mark and Peggy will provide new Hits data that go in parallel with the raw data.
# (2007-7-24) The following subtracks for encodeUCDavisChip were released: (Ann)
# encodeUCDavisPolIIGM
# encodeUCDavisPolIIHelaS3
# encodeUCDavisTafGM
# encodeUCDavisTafHelaS3
# encodeUCDavisE2F1Median
# encodeUCDavisChipMyc
#


##########################################################################
# Yale TAR and TransMap (2005-10-31 kate)
# Submitted: 10/14 by Joel Rozowsky
# 5 bed files (TARs) and 5 wig files (Signal)
# Replacements for June tracks (and drop individual 10 Neu samples)
# Methods changed somewhat -- use new description from Joel's email
# NOTE: adjusted start coord -1 to correspond to their DART entries --
# verifying with Joel

# new data submitted 3/20/07 by guoneng.zhong@yale.edu (kate)

    cd /cluster/data/encode/yale/rna/2005-10-14
# The generated loadBed.csh loads each lab/*.bed file as a BED 4 table:
#   - the table name is the file's base name with underscores removed,
#     "ncbi35" dropped and CTRL/Placenta/Neutrophil abbreviated to
#     Untr/Plac/Neut
#   - the sed removes everything from "http" through "acc=" on each line,
#     so what followed "acc=" is what remains of that text
#   - the awk writes columns 1-4 with the start coordinate decreased by 1
cat > loadBed.csh << 'EOF'
    foreach f (lab/*.bed)
        set table = `echo $f:t:r | sed 's/_//g; s/CTRL/Untr/; s/ncbi35//; s/Placenta/Plac/; s/Neutrophil/Neut/'`
        echo $table
        sed 's/http.*acc=//' $f | \
            awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $4}' | \
                hgLoadBed -strict hg17 $table stdin
    end
'EOF'
    csh loadBed.csh >&! loadBed.log 

# NOTE: trim overlaps in regions resulting from array design
# Joel should have done this for us -- he will verify the files.
# Also, need to adjust coords +1 as per J. Rozowsky
# The generated loadSig.csh processes each lab/*.wig file:
#   - the table name is derived from the file name (underscores removed,
#     "ncbi35" dropped, Transcript/CTRL/Placenta/Neutrophil abbreviated)
#   - lines starting with "chr" are kept, start and end are both increased
#     by 1, the result is sorted by chrom/start and passed through
#     trimOverlap.pl into <table>.trim
#   - <table>.trim is encoded with wigEncode into wig/ and wib/, the wig
#     table is loaded with /gbdb/hg17/encode/YaleRna/2005-10-14 as path
#     prefix, and the .wib file is symlinked into that directory's wib/
cat > loadSig.csh << 'EOF'
    mkdir -p wig wib
    set gdir = /gbdb/hg17/encode/YaleRna/2005-10-14
    mkdir -p $gdir/wib
    foreach f (lab/*.wig)
        set table = `echo $f:t:r | sed 's/_//g; s/Transcript/Trans/; s/CTRL/Untr/; s/ncbi35//; s/Placenta/Plac/; s/Neutrophil/Neut/'`
        echo $table
        grep "^chr" $f | \
            awk '{printf "%s\t%d\t%d\t%s\n", $1, $2+1, $3+1, $4}' | \
            sort -k1,1 -k2,2n | \
            /cluster/data/encode/bin/scripts/trimOverlap.pl > $table.trim
        wigEncode $table.trim wig/$table.wig wib/$table.wib
        hgLoadWiggle -pathPrefix=$gdir hg17 $table wig/$table.wig
        ln -s `pwd`/wib/$table.wib $gdir/wib
    end
'EOF'
    csh loadSig.csh >&! loadSig.log 
    rm -f *.trim

    # restoring Neutrophil table which somehow got dropped from hgwdev
    # 2006-01-04 kate
    set table = encodeYaleAffyNeutRNATransMap
    wigEncode $table.trim wig/$table.wig wib/$table.wib
    # Converted encodeYaleAffyNeutRNATransMap.trim, upper limit 3275.25, lower limit -2658.03
    set gdir = /gbdb/hg17/encode/YaleRna/2005-10-14
    hgLoadWiggle -pathPrefix=$gdir hg17 $table wig/$table.wig
    

    # post wig downloads
    ssh kkstore03
    cd /cluster/data/encode/yale/rna/latest
    mkdir downloads
    foreach f (*.trim)
        set table = ($f:r)
        echo $table
        gzip -c $f > downloads/$table.bedGraph.gz
    end

    # new data submitted 3/20/07 by guoneng.zhong@yale.edu (kate)
    ssh kkstore03
    cd /cluster/data/encode/yale/rna
    mkdir -p 2007-03-20/lab
    cd 2007-03-20/lab
    # move the submitted tarball from the FTP area into lab/ (mv needs an
    # explicit destination) and unpack it; the files used below live in
    # lab/joel_affy/
    mv /var/ftp/encode/joel_affy.tgz .
    # x and r are conflicting tar operations, so only x (extract) is given
    tar xvfz joel_affy.tgz

    #	Loaded into hg17 2007-06-11 - Hiram
    cd /cluster/data/encode/yale/rna/2007-03-20
    ln -s \
      lab/joel_affy/encode_Yale_Affy_HELAS3_PolyA_RNA_Tars_Relaxed_ncbi35.bed \
	./encodeYaleAffyHELAS3PolyARNATarsRelaxed.bed
    ln -s \
    lab/joel_affy/encode_Yale_Affy_HELAS3_PolyA_RNA_Tars_Stringent_ncbi35.bed \
	./encodeYaleAffyHELAS3PolyARNATarsStringent.bed
    ln -s \
    lab/joel_affy/encode_Yale_Affy_HELAS3_PolyA_RNA_Transcript_Map_ncbi35.wig \
	./encodeYaleAffyHELAS3PolyARNATranscriptMap_lab.wig
    ln -s \
      lab/joel_affy/encode_Yale_Affy_HELAS3_Total_RNA_Tars_Relaxed_ncbi35.bed \
	./encodeYaleAffyHELAS3TotalRNATarsRelaxed.bed
    ln -s \
    lab/joel_affy/encode_Yale_Affy_HELAS3_Total_RNA_Tars_Stringent_ncbi35.bed \
	./encodeYaleAffyHELAS3TotalRNATarsStringent.bed
    ln -s \
    lab/joel_affy/encode_Yale_Affy_HELAS3_Total_RNA_Transcript_Map_ncbi35.wig \
	./encodeYaleAffyHELAS3TotalRNATranscriptMap_lab.wig
#	It looks like the start coordinates in the .wig are now correct
#	0-relative, but they still need to be trimmed to not overlap.
#	And previously, both start and end were bumped one, these ends
#	have not been bumped one.
#	And they appear to have a bunch of extra blanks around the tab
#	separators, clean them up with the sed.
    # NOTE: this loop is Bourne-shell (sh/bash) syntax, unlike the csh
    # foreach loops used elsewhere in this doc; run it under sh/bash.
    # For each *_lab.wig symlink: drop "_lab" from the name, remove all
    # blanks from the data, sort by chrom/start, trim overlaps, and write
    # the result to <name>.wig.trim (the input of the wigEncode step below).
    for F in ./encodeYaleAffyHELAS3PolyARNATranscriptMap_lab.wig \
	./encodeYaleAffyHELAS3TotalRNATranscriptMap_lab.wig
do
    T=`echo $F | sed -e "s/_lab//"`
    sed -e "s/ //g" ${F} | sort -k1,1 -k2,2n \
            | /cluster/data/encode/bin/scripts/trimOverlap.pl > ${T}.trim
done

    wigEncode encodeYaleAffyHELAS3PolyARNATranscriptMap.wig.trim \
        encodeYaleAffyHELAS3PolyARNATranscriptMap.wig \
        encodeYaleAffyHELAS3PolyARNATranscriptMap.wib
    wigEncode encodeYaleAffyHELAS3TotalRNATranscriptMap.wig.trim \
        encodeYaleAffyHELAS3TotalRNATranscriptMap.wig \
        encodeYaleAffyHELAS3TotalRNATranscriptMap.wib
    hgLoadWiggle -pathPrefix=/gbdb/hg17/wib hg17 \
        encodeYaleAffyHELAS3PolyARNATranscriptMap \
        encodeYaleAffyHELAS3PolyARNATranscriptMap.wig
    ln -s  `pwd`/encodeYaleAffyHELAS3PolyARNATranscriptMap.wib \
        /gbdb/hg17/wib/encodeYaleAffyHELAS3PolyARNATranscriptMap.wib 
    hgLoadWiggle -pathPrefix=/gbdb/hg17/wib hg17 \
        encodeYaleAffyHELAS3TotalRNATranscriptMap \
        encodeYaleAffyHELAS3TotalRNATranscriptMap.wig
    ln -s  `pwd`/encodeYaleAffyHELAS3TotalRNATranscriptMap.wib \
        /gbdb/hg17/wib/encodeYaleAffyHELAS3TotalRNATranscriptMap.wib 


# Build a BED4 file for each Tars set from the first three columns of the
# lab file, naming every item site_<line number>, then load it.
# Any existing .bed of the same name is removed first, so the redirect
# always creates a fresh file.
for SET in PolyA_RNA_Tars_Relaxed PolyA_RNA_Tars_Stringent \
	Total_RNA_Tars_Relaxed Total_RNA_Tars_Stringent
do
    # table name: fixed prefix + set name with the underscores removed
    TABLE=encodeYaleAffyHELAS3`echo ${SET} | sed -e "s/_//g"`
    rm -f ${TABLE}.bed
    awk '{printf "%s\t%s\t%s\tsite_%d\n", $1,$2,$3,NR}' \
	lab/joel_affy/encode_Yale_Affy_HELAS3_${SET}_ncbi35.bed \
	> ${TABLE}.bed
    hgLoadBed hg17 ${TABLE} ${TABLE}.bed
done


##########################################################################
# Yale Chip/chip (2005-10-31 kate)
# Final submission: 10/26 by Zhengdong Zhang
# signal, pval, and sites for 5 factors (50x38 array)
# Sites file has URL to Gerstein lab as 5th field.
# I'm extracting the accession from it and saving
# as the name field in a BED5.  Score range: .602-3.23
#  Scale *330 produces integer range 200-1000.
# NOTE: >50% of sites are < 1 data value, so use 200 as low score

# New data submitted 2006-11-29 by guoneng.zhong@yale.edu
# 6 datasets -- Pol2 (2 antibodies) and H3K4ac in HeLa S3 and GM06990 cells
# Loaded P-Value and Signal tracks data into database
# Loaded into database (DONE, 2006-12-19, kate)

# New data submitted 2007-01-30 by guoneng.zhong@yale.edu
# 7 datasets, 5 factors in HeLa S3 and K562 cells
# Loaded P-Value and Signal tracks data into database
# (DONE, 2007-06-02 - 2007-06-03, hartera)

# Resubmitted hits data for Nov06 and Jan07 - this is for the Sites track
# submitted 2007-03-20 by guoneng.zhong@yale.edu
# Loaded Sites data for Nov06 and Jan07 (DONE, 2007-06-23, hartera)

# New data submitted 2007-03-27 by guoneng.zhong@yale.edu
# 3 datasets, 3 factors in HeLa S3 cells

# New data submitted 2007-06-06 by guoneng.zhong@yale.edu
# This is the same as the data from 2007-03-27 (April 2007 batch) but it 
# has only one *-signal.wig file for each directory and additionally, there is
# now P-value data. 
# Prepared and loaded P-value and Signal data 
# (DONE, 2007-06-06 - 2007-06-07, hartera)
# Loaded Sites (hits) data (DONE, hartera, 2007-06-23) 

# New data submitted 2007-06-15 by guoneng.zhong@yale.edu
# 1 dataset, 1 factor in HeLa S3 cells
# Loaded P-value and Signal data (2007-06-18, hartera)
# Loaded Sites (hits) data (DONE, hartera, 2007-06-23) 

# Histogram (bin size 0.5) of column 4 of all the *-hits.bed files:
cat *_?/*-hits.bed | awk '{print $4}' | sort -n \
    | textHistogram -real -binSize=.5 stdin
0.500000 ************************************************************ 628
1.000000 ************** 148
1.500000 ** 25
2.000000 ***************** 173
2.500000 ** 24
3.000000  1

# signal data dist:
0.000000 ************************************************************ 1782529
0.500000 *** 93246
1.000000 * 24171
1.500000  11657
2.000000  1764
2.500000  185
3.000000  13

# New data submitted 2006-11-29 by guoneng.zhong@yale.edu
# 6 datasets -- Pol2 (2 antibodies) and H3K4ac in HeLa and GM06990

    cd /cluster/data/encode/yale/chip/2005-10-26
    mkdir -p wig wib
cat > loadSites.csh << 'EOF'
    # Load the Sites BED file of each factor as a bed5FloatScore-style table
    # named encodeYaleChipSites<Factor>.
    set prefix = encodeYaleChip
    foreach dir (lab/{jun,fos,taf,baf155,baf170})
        set name = $dir:t
        # capitalize the first letter for use in the table name
        set Name = `echo $name | sed 's/\(.*\)/\u\1/'`
        echo $Name

        set tbl = ${prefix}Sites$Name
        echo $tbl
        set bed = $dir/Encode_Yale_ChIpChip_${name}_Hela_Maskless50merevery38bp_Sites.bed
        dos2unix $bed
        # table definition: bed5FloatScore.sql with the table name swapped in
        sed -e "s/bed5FloatScore/$tbl/" \
            $HOME/kent/src/hg/lib/bed5FloatScore.sql > $tbl.sql
        # Replacing the first "=" with a blank splits the URL field, so the
        # text after it becomes field 6 and is used as the item name.
        # Start is decremented by one; score is the data value * 330.
        sed 's/=/ /' $bed \
          | awk '{printf "%s\t%d\t%d\t%s\t%d\t%.3f\n",$1,$2-1,$3,$6,($4 * 330),$4}' \
          | hgLoadBed -strict -sqlTable=$tbl.sql hg17 $tbl stdin
    end
'EOF'
    csh loadSites.csh >&! loadSites.log

cat > loadSig.csh << 'EOF'
    # Load the P-value and Signal data of each factor as bedGraph tables
    # named encodeYaleChipPval<Factor> and encodeYaleChipSignal<Factor>.
    set pfx = encodeYaleChip
    foreach d (lab/{jun,fos,taf,baf155,baf170})
        set factor = $d:t
        # capitalize the first letter for use in the table name
        set Factor = `echo $factor | sed 's/\(.*\)/\u\1/'`
        echo $Factor
        set p = $d/Encode_Yale_ChIpChip_${factor}_Hela_Maskless50merevery38bp

        # load pval: sort by chrom then start, filter through trimOverlap.pl
        set table = ${pfx}Pval$Factor
        echo $table
        set f = ${p}_Pvalue.wig
        sort -k1,1 -k2,2n $f | \
            /cluster/data/encode/bin/scripts/trimOverlap.pl > $table.trim
        hgLoadBed -strict -bedGraph=4 hg17 $table $table.trim

        # load signal: same processing as pval
        set table = ${pfx}Signal$Factor
        echo $table
        set f = ${p}_Signal.wig
        sort -k1,1 -k2,2n $f | \
            /cluster/data/encode/bin/scripts/trimOverlap.pl > $table.trim
        hgLoadBed -strict -bedGraph=4 hg17 $table $table.trim 
    end
'EOF'
    csh loadSig.csh >&! loadSig.log &
    # The load runs in the background and creates/reads the *.trim files;
    # wait for it to finish so the cleanup below cannot remove files that
    # are still in use.
    wait
    rm -f *.trim *.sql

    # description files
    # use server with "antiword" available
    foreach d (lab/{jun,fos,taf,baf155,baf170})
        # one text description per factor, named after the factor directory
        antiword $d/*.doc > ${d:t}.txt
    end

    # New data submitted 2006-11-29 by guoneng.zhong@yale.edu
    # 6 datasets -- Pol2 (2 antibodies) and H3K4ac in HeLa and GM06990
    # Loaded into database (2006-12-19, kate)
    cd /cluster/data/encode/yale/chip
    # -p: the dated directory and its lab/ subdirectory are created together
    mkdir -p 2006-11-29/lab
    cd 2006-11-29/lab
    # NOTE(review): assumes yale_batch_1.tgz has been copied into lab/ first
    tar xfz yale_batch_1.tgz

    # files are in 6 dirs, 1 per experiment
    # info.txt in experiment dir indicates antibody/cell line
    # files to load are: miyoung_?/*-*-*-hits.bed, *signal.wig, *pvalue.wig
    # pvalue and wig appear same format as previous, with overlap,
    # so use same processing as previous datasets (above)
    # Score range in hits files: 0.255 - 3.788
    # The low score is lower than previous data sets (.6), so similar
    # scaling for score (*330) will produce a near-invisible 84 score
    # Check with submitter. Sites data distribution is:
    0.000000 ******************************* 1399
    0.500000 ************************************************************ 2731
    1.000000 ********************************* 1494
    1.500000 ****** 288
    2.000000 * 62
    2.500000  18
    3.000000  6
    3.500000  2

    # signal data dist
    0.000000 ************************************************************ 2162644
    0.500000 *** 111537
    1.000000 * 24294
    1.500000  3919
    2.000000  786
    2.500000  353
    3.000000  26
    3.500000  51
    
    cd /cluster/data/encode/yale/chip/2006-11-29
    # Link each experiment directory to a name of the form <antibody>_<cell>;
    # the load script below derives the table names from these link names.
    ln -s lab/miyoung_1 pol2n_hela
    ln -s lab/miyoung_2 pol2n_gm06990
    ln -s lab/miyoung_3 pol2_hela
    ln -s lab/miyoung_4 pol2_gm06990
    ln -s lab/miyoung_7 h4kac4_hela
    ln -s lab/miyoung_8 h4kac4_gm06990
cat > loadSig.csh << 'EOF'
#!/bin/csh -ef
  # For each linked dataset, load the P-value and Signal data as bedGraph
  # tables named encodeYaleChip{Pval,Signal}<Antibody><Cell>.
  set pfx = encodeYaleChip
  foreach d (pol2n_hela pol2n_gm06990 pol2_hela pol2_gm06990 h4kac4_hela h4kac4_gm06990)
        # split <antibody>_<cell> and capitalize the first letter of each part
        set factor = `echo $d | sed 's/_.*//'`
        set cell = `echo $d | sed 's/.*_//'`
        set Factor = `echo $factor | perl -wpe 's/(.*)/\u$1/'`
        set Cell = `echo $cell | perl -wpe 's/(.*)/\u$1/'`
        echo $Factor $Cell

        # load pval: sort by chrom then start, filter through trimOverlap.pl
        set table = ${pfx}Pval$Factor$Cell
        echo $table
        set f = $d/*-*-*-pvalue.wig
        sort -k1,1 -k2,2n $f | \
            /cluster/data/encode/bin/scripts/trimOverlap.pl > $table.trim
        hgLoadBed -strict -bedGraph=4 hg17 $table $table.trim

        # load signal: same processing as pval
        set table = ${pfx}Signal$Factor$Cell
        echo $table
        set f = $d/*-*-*-signal.wig
        sort -k1,1 -k2,2n $f | \
            /cluster/data/encode/bin/scripts/trimOverlap.pl > $table.trim
        hgLoadBed -strict -bedGraph=4 hg17 $table $table.trim 
  end
'EOF'
    # the script is invoked by name below, so it must be executable
    chmod +x loadSig.csh
    loadSig.csh >&! loadSig.log &
    egrep 'trim|Loaded' *.log
        Reading encodeYaleChipPvalPol2nHela.trim
        Loaded 382721 elements of size 4
        Reading encodeYaleChipSignalPol2nHela.trim
        Loaded 382721 elements of size 4
        Reading encodeYaleChipPvalPol2nGm06990.trim
        Loaded 385149 elements of size 4
        Reading encodeYaleChipSignalPol2nGm06990.trim
        Loaded 385149 elements of size 4
        Reading encodeYaleChipPvalPol2Hela.trim
        Loaded 382721 elements of size 4
        Reading encodeYaleChipSignalPol2Hela.trim
        Loaded 382721 elements of size 4
        Reading encodeYaleChipPvalPol2Gm06990.trim
        Loaded 385149 elements of size 4
        Reading encodeYaleChipSignalPol2Gm06990.trim
        Loaded 385149 elements of size 4
        Reading encodeYaleChipPvalH4kac4Hela.trim
        Loaded 382721 elements of size 4
        Reading encodeYaleChipSignalH4kac4Hela.trim
        Loaded 382721 elements of size 4
        Reading encodeYaleChipPvalH4kac4Gm06990.trim
        Loaded 385149 elements of size 4
        Reading encodeYaleChipSignalH4kac4Gm06990.trim
        Loaded 385149 elements of size 4
    # Note size difference between Gm06990 and Hela -- inform submitter.
    # E-mail from April 4, 2007 states the following:
    # [The researcher] said that there were no real deliberate
    # reasons for the slight discrepancy. The arrays were newer so there were 
    # more features than the hela in the past.
    # Only P-Value and Signal data loaded because hits data was updated on
    # 2007-03-20.

# New data submitted 2007-01-30 by guoneng.zhong@yale.edu
# 7 datasets
# 5 factors in HeLa and K562 cells
# Prepared data and loaded into database (2007-06-02 - 2007-06-03, hartera)

    cd /cluster/data/encode/yale/chip/2007-01-30
    mkdir lab
    # unpack the archive (expected in lab/) into lab/, then move there
    tar xvzf lab/jan07_batch.tgz -C lab
    cd lab
    # files are in 7 dirs, 1 per experiment
    # info.txt in experiment dir indicates antibody/cell line
    # files to load are: miyoung_?/*-*-*-hits.bed, *signal.wig, *pvalue.wig
    # and ghia_?/*-*-*-hits.bed, *signal.wig, *pvalue.wig
    # pvalue and wig appear same format as previous, with overlap,
    # so use same processing as previous datasets (above).
    # Signal scores range: 0-3.001
    # P-values scores range: 0-20.0
    # Hits scores range: 0.223-2.681
    cd /cluster/data/encode/yale/chip/2007-01-30
    # P-values data distribution:
0.000000 ************************************************************ 1845236
2.000000 *********** 331763
4.000000 ****** 195195
6.000000 **** 127591
8.000000 *** 84006
10.000000 ** 52971
12.000000 * 29541
14.000000  12279
16.000000  0
18.000000  0
20.000000  7749

    # Signal data distribution:
0.000000 ************************************************************ 2455784
0.500000 ***** 187810
1.000000 * 33419
1.500000  8165
2.000000  1084
2.500000  68
3.000000  1
    # For ghia_8, the target is BAF155 (NOT p65 as in the ghia_8/info.txt
    # file). For miyoung_10, the antibody is sc-372 which is against the 
    # C-terminus of NFkB p65 and for miyoung_11, the antibody is sc-109 which 
    # is against the N-terminus of NFkB p65.
    
    # Create links to data:
    ln -s lab/jan07_batch/ghia_8  baf155_k562
    ln -s lab/jan07_batch/ghia_10 baf170_k562
    ln -s lab/jan07_batch/ghia_14 baf47_k562
    ln -s lab/jan07_batch/ghia_15 baf47_hela
    ln -s lab/jan07_batch/ghia_24 stat1_hela_ifna
    ln -s lab/jan07_batch/miyoung_10 p65c_hela_tnfa
    ln -s lab/jan07_batch/miyoung_11 p65n_hela_tnfa

    # Link name format is antibody_cell, except for the last 3 where it is
    # antibody_cell_treatment.
    # ifna = interferon alpha, tnfa = tumor necrosis factor (TNF) alpha 
    # NOTE(review): baf47_hela is linked above but is not in the load list
    # below -- confirm whether that is intentional.
    # Load the new subtracks for Signal and P-value data: 
cat > loadSig.csh << 'EOF'
#!/bin/csh -ef
  # For each linked dataset, load the P-value and Signal data as bedGraph
  # tables named encodeYaleChip{Pval,Signal}<Antibody><Cell>[<Treatment>].
  set pfx = encodeYaleChip
  foreach d (baf155_k562 baf170_k562 baf47_k562 stat1_hela_ifna p65c_hela_tnfa p65n_hela_tnfa)
        # Stim stays empty for the two-part (antibody_cell) names
        set Stim=""
        if ($d =~ baf*) then
             # antibody_cell
             set factor = `echo $d | sed 's/_.*//'`
             set cell = `echo $d | sed 's/.*_//'`
             set Factor = `echo $factor | perl -wpe 's/(.*)/\u$1/'`
             set Cell = `echo $cell | perl -wpe 's/(.*)/\u$1/'`
             echo $Factor $Cell
        else
             # antibody_cell_treatment
             set factor = `echo $d | sed 's/_.*//'`
             set cell = `echo $d | perl -wpe 's/^.*_(.*)_.*$/$1/'`
             set stim = `echo $d | perl -wpe 's/.*_.*_//' `
             set Factor = `echo $factor | perl -wpe 's/(.*)/\u$1/'`
             set Cell = `echo $cell | perl -wpe 's/(.*)/\u$1/'`
             set Stim = `echo $stim | perl -wpe 's/(.*)/\u$1/'`
             echo $Factor $Cell $Stim
        endif

        # load pval: sort by chrom then start, filter through trimOverlap.pl
        set table = ${pfx}Pval$Factor$Cell$Stim
        echo $table
        set f = $d/*-*-*-pvalue.wig
        sort -k1,1 -k2,2n $f | \
            /cluster/data/encode/bin/scripts/trimOverlap.pl > $table.trim
        hgLoadBed -bedGraph=4 hg17 $table $table.trim

        # load signal: same processing as pval. The table name must carry
        # the treatment suffix too (e.g. encodeYaleChipSignalStat1HelaIfna),
        # matching the Pval table and the tables listed in the load log.
        set table = ${pfx}Signal$Factor$Cell$Stim
        echo $table
        set f = $d/*-*-*-signal.wig
        sort -k1,1 -k2,2n $f | \
            /cluster/data/encode/bin/scripts/trimOverlap.pl > $table.trim
        hgLoadBed -bedGraph=4 hg17 $table $table.trim 
  end
'EOF'
    chmod +x loadSig.csh
    loadSig.csh >&! loadSig.log &
    egrep 'trim|Loaded' *.log
Reading encodeYaleChipPvalBaf155K562.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipSignalBaf155K562.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipPvalBaf170K562.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipSignalBaf170K562.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipPvalBaf47K562.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipSignalBaf47K562.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipPvalStat1HelaIfna.trim
Loaded 385149 elements of size 4
Reading encodeYaleChipSignalStat1HelaIfna.trim
Loaded 385149 elements of size 4
Reading encodeYaleChipPvalP65cHelaTnfa.trim
Loaded 385149 elements of size 4
Reading encodeYaleChipSignalP65cHelaTnfa.trim
Loaded 385149 elements of size 4
Reading encodeYaleChipPvalP65nHelaTnfa.trim
Loaded 385149 elements of size 4
Reading encodeYaleChipSignalP65nHelaTnfa.trim
Loaded 385149 elements of size 4
 
    # Only P-Value and Signal data loaded because hits data was updated on
    # March 20, 2007.
    # Added trackDb entries for Nov06 and Jan07 P-value and Signal data.

# New data submitted 2007-03-27 by guoneng.zhong@yale.edu
# Started processing data (2007-06-06, hartera)

    cd /cluster/data/encode/yale/chip/2007-03-27
    mkdir lab
    # unpack the archive (expected in lab/) into lab/ without leaving the
    # dated directory
    unzip lab/apr07_batch.zip -d lab

    # files are in 3 dirs, 1 per experiment
    # info.txt in experiment dir indicates antibody/cell line
    # files to load are: ghia_?/*hits_N.0.bed, *signal.wig, *pvalue.wig.
    # For hits, N is the % false discovery rate (FDR). load all of the 
    # 10% FDR files and note whether each hit appears first in the
    # 1, 5 or 10 % FDR dataset.
    # pvalue and wig appear same format as previous, with overlap,
    # so use same processing as previous datasets (above).
    # Signal data range: 0 - 4.659
    # Signal data distribution:
0.000000 ************************************************************ 1074500
0.500000 *** 62133
1.000000  7327
1.500000  1543
2.000000  1119
2.500000  529
3.000000  489
3.500000  408
4.000000  104
4.500000  11

    # Create links to data: one <antibody>_<cell> link per experiment dir
    set expts = (ghia_12 ghia_17 ghia_25)
    set names = (smarca4_hela smarca6_hela nrsf_hela)
    foreach i (1 2 3)
        ln -s lab/apr07_batch/$expts[$i] $names[$i]
    end
    # Only Signal data so no P-Value data to load
    # E-mailed track contributors to ask about data and also for SMARCA4 
    # description as none in the readme.txt. Also, asked why there is no
    # P-value data and also if the hits file contains 0-based or 1-based 
    # start coordinates (2007-06-06, hartera)
    # Contributors say that the composite file (xxxxx_yyyy_zzzzz-signal.wig) 
    # for each dataset is the average of the three replicates which are in 
    # the other 3 files (xxxxx-signal.wig, yyyyy-signal.wig,
    # zzzzz-signal.wig).

# New data submitted 2007-06-06 by guoneng.zhong@yale.edu
# This is the same as the data from 2007-03-27 (April 2007 batch) but it 
# has only one *-signal.wig file for each directory and additionally, there is
# now P-value data. 
# Prepared and loaded P-value and Signal data (2007-06-06 - 2007-06-07, hartera)
    mkdir -p /cluster/data/encode/yale/chip/2007-06-06/lab
    cd /cluster/data/encode/yale/chip/2007-06-06
    # unpack the archive (expected in lab/) into lab/
    unzip lab/yale-apr07_batch_b.zip -d lab
     
    # files are in 3 dirs, 1 per experiment
    # info.txt in experiment dir indicates antibody/cell line
    # files to load are: ghia_?/*-*-*-hits.bed, *signal.wig, *pvalue.wig
    # pvalue and wig appear same format as previous, with overlap,
    # so use same processing as previous datasets (above).
    # Signal data range: 0 - 4.659
    # P-value data range: 0 - 14.574
    # Hits for 10% FDR range: 0.340833 - 4.659167
    # Signal data distribution:
0.000000 ************************************************************ 1074500
0.500000 *** 62133
1.000000  7327
1.500000  1543
2.000000  1119
2.500000  529
3.000000  489
3.500000  408
4.000000  104
4.500000  11
    # P-value distribution:
0.000000 ************************************************************ 849713
2.000000 ********* 133584
4.000000 ***** 73052
6.000000 *** 43713
8.000000 ** 25797
10.000000 * 13724
12.000000  6421
14.000000  2159
    # Link each experiment directory to an <antibody>_<cell> name:
    ln -s lab/apr07_batch_b/ghia_12  smarca4_hela
    ln -s lab/apr07_batch_b/ghia_17  smarca6_hela
    ln -s lab/apr07_batch_b/ghia_25  nrsf_hela

cat > loadSig.csh << 'EOF'
#!/bin/csh -ef
  # For each linked dataset, load the P-value and then the Signal data as
  # bedGraph tables named encodeYaleChip{Pval,Signal}<Antibody><Cell>.
  set prefix = encodeYaleChip
  foreach dataset (smarca4_hela smarca6_hela nrsf_hela)
        # split <antibody>_<cell> and capitalize the first letter of each part
        set antibody = `echo $dataset | sed 's/_.*//'`
        set cellLine = `echo $dataset | sed 's/.*_//'`
        set Antibody = `echo $antibody | perl -wpe 's/(.*)/\u$1/'`
        set CellLine = `echo $cellLine | perl -wpe 's/(.*)/\u$1/'`
        echo $Antibody $CellLine

        foreach kind (pvalue signal)
            if ($kind == pvalue) then
                set tbl = ${prefix}Pval$Antibody$CellLine
            else
                set tbl = ${prefix}Signal$Antibody$CellLine
            endif
            echo $tbl
            set wig = $dataset/*_*_*-$kind.wig
            # sort by chrom then start, filter through trimOverlap.pl, load
            sort -k1,1 -k2,2n $wig \
                | /cluster/data/encode/bin/scripts/trimOverlap.pl > $tbl.trim
            hgLoadBed -bedGraph=4 hg17 $tbl $tbl.trim
        end
  end
'EOF'
    chmod +x loadSig.csh
    loadSig.csh >&! loadSig.log &
    egrep 'trim|Loaded' *.log
Reading encodeYaleChipPvalSmarca4Hela.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipSignalSmarca4Hela.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipPvalSmarca6Hela.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipSignalSmarca6Hela.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipPvalNrsfHela.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipSignalNrsfHela.trim
Loaded 382721 elements of size 4
    # Added trackDb entries for subtracks for P-value and Signal Apr07 data.

# New data submitted 2007-06-15 by guoneng.zhong@yale.edu
# Loaded P-value and Signal data (2007-06-18, hartera)
    cd /cluster/data/encode/yale/chip/2007-06-15
    mkdir lab
    # unpack the archive (expected in lab/) into lab/ without leaving the
    # dated directory
    unzip lab/yale-jun07_batch.zip -d lab
    # files are in 1 dir, 1 per experiment
    # antibody is against me3K27 Histone H3, so this is H3K27me3
    # info.txt in experiment dir indicates antibody/cell line
    # files to load are: ghia_?/*_*_*-hits.bed, *signal.wig, *pvalue.wig
    # pvalue and wig appear same format as previous, with overlap,
    # so use same processing as previous datasets (above).
    # Signal data range: 0 - 2.704
    # P-value data range: 0 - 14.574
    # Hits for 10% FDR range: 0.426667 - 2.704167
    # Signal data distribution:
0.000000 ************************************************************ 336894
0.500000 ***** 30711
1.000000 ** 11315
1.500000 * 3403
2.000000  375
2.500000  23

    # P-value distribution:
0.000000 ************************************************************ 291121
2.000000 ***** 23592
4.000000 **** 19303
6.000000 *** 16880
8.000000 *** 14049
10.000000 ** 9892
12.000000 * 5884
14.000000  2000
 
    # Link the experiment directory to its <antibody>_<cell> name:
    ln -s lab/jun07_batch/ghia_49  h3k27me3_hela
cat > loadSig.csh << 'EOF'
#!/bin/csh -ef
  # Load the P-value and then the Signal data of the linked dataset as
  # bedGraph tables named encodeYaleChip{Pval,Signal}<Antibody><Cell>.
  set prefix = encodeYaleChip
  foreach dataset (h3k27me3_hela)
        # split <antibody>_<cell> and capitalize the first letter of each part
        set antibody = `echo $dataset | sed 's/_.*//'`
        set cellLine = `echo $dataset | sed 's/.*_//'`
        set Antibody = `echo $antibody | perl -wpe 's/(.*)/\u$1/'`
        set CellLine = `echo $cellLine | perl -wpe 's/(.*)/\u$1/'`
        echo $Antibody $CellLine

        foreach kind (pvalue signal)
            if ($kind == pvalue) then
                set tbl = ${prefix}Pval$Antibody$CellLine
            else
                set tbl = ${prefix}Signal$Antibody$CellLine
            endif
            echo $tbl
            set wig = $dataset/*_*_*-$kind.wig
            # sort by chrom then start, filter through trimOverlap.pl, load
            sort -k1,1 -k2,2n $wig \
                | /cluster/data/encode/bin/scripts/trimOverlap.pl > $tbl.trim
            hgLoadBed -bedGraph=4 hg17 $tbl $tbl.trim
        end
  end
'EOF'
    chmod +x loadSig.csh
    loadSig.csh >&! loadSig.log &
    egrep 'trim|Loaded' *.log
Reading encodeYaleChipPvalH3k27me3Hela.trim
Loaded 382721 elements of size 4
Reading encodeYaleChipSignalH3k27me3Hela.trim
Loaded 382721 elements of size 4
    # added human/hg17/trackDb.encode.ra subtrack entry for Pval and Sig tracks
    # added information about the antibody and its target to the
    # tracks' description pages.

# Resubmitted hits data for Nov06 and Jan07 - this is for the Sites track.
# submitted 2007-03-20 by guoneng.zhong@yale.edu
# E-mailed to ask whether start coordinates are 0-based or 1-based, 2007-06-06
# Received e-mail on 2007-06-18 to confirm that the start coordinates 
# for the hits data for the Sites track are 1-based.
# Started preparing data (2007-06-19 - 2007-06-20, hartera)
    cd /cluster/data/encode/yale/chip
    mkdir -p 2007-03-20/lab
    cd 2007-03-20/lab
    cp /var/ftp/encode/ucsc_resubmission.tgz .
    # list the archive contents, then unpack it
    tar tvfz ucsc_resubmission.tgz
    # NOTE(review): the original notes only listed the archive; the links
    # below need the experiment directories directly under lab/, so this
    # assumes the archive unpacks to miyoung_*/ghia_* at its top level.
    tar xvfz ucsc_resubmission.tgz
    # Loading resubmitted data (hartera, 2007-06-03) 
    # This is only the hits data as only this has changed for the 
    # November 2006 and January 2007 submissions. Load as subtracks for the
    # Yale ChIP Sites track.
    cd /cluster/data/encode/yale/chip/2007-03-20
    # These data sets relate to the 2006-11-29 data:
    ln -s lab/miyoung_1 pol2n_hela
    ln -s lab/miyoung_2 pol2n_gm06990
    ln -s lab/miyoung_3 pol2_hela
    ln -s lab/miyoung_4 pol2_gm06990
    ln -s lab/miyoung_7 h4kac4_hela
    ln -s lab/miyoung_8 h4kac4_gm06990
    # format is antibody_cell

    # These data sets relate to the 2007-01-30 data:
    ln -s lab/ghia_8_2  baf155_k562
    ln -s lab/ghia_10_1 baf170_k562
    ln -s lab/ghia_14_1 baf47_k562
    ln -s lab/ghia_15 baf47_hela
    ln -s lab/ghia_24 stat1_hela_ifna
    ln -s lab/miyoung_10 p65c_hela_tnfa
    ln -s lab/miyoung_11 p65n_hela_tnfa
    # format for the last 3 is antibody_cell_treatment
    # ifna = interferon alpha, tnfa = tumor necrosis factor (TNF) alpha 
    # These are BED 4 files. Previously, we were sent BED 5 files with an ID
    # to link to the DART database. Add a Sites name and add the False
    # Discovery rate as an extra column. 
    # Load the Sites track data:

# Create a bed5FloatScoreWithFdr.sql table with an extra integer data field
# for the False Discovery Rate (FDR) from the bed5FloatScore.sql definition
# in $HOME/kent/src/hg/lib. The False Discovery Rate is the lowest rate
# used at which the data was included. Data for FDRs of 1, 5 and 10% were 
# submitted for this dataset so, for example an item included with FDR of 1%
# will also be included for an FDR of 5% and 10%, if the FDR is 5% then it is
# also included in the data generated with an FDR of 10%.

# Write the table definition file. Everything between the heredoc markers is
# the literal content of bed5FloatScoreWithFdr.sql; the loadSites.csh scripts
# substitute the real table name for "bed5FloatScoreWithFdr" with sed and
# pass the result to hgLoadBed -sqlTable.
cat << 'EOF' > bed5FloatScoreWithFdr.sql
# bed5FloatScore.sql was originally generated by the autoSql program, which also 
# generated bed5FloatScore.c and bed5FloatScore.h.  This creates the database representation of
# an object which can be loaded and saved from RAM in a fairly 
# automatic way.
# bed5FloatScore.sql was edited to create the bed5FloatScoreWithFdr.sql 
# definition.
#BED 5 (with 0-1000 score), but also with floating-point score and false
# discovery rate (FDR).
CREATE TABLE bed5FloatScoreWithFdr (
    bin smallint not null,      # Index field
    chrom varchar(255) not null,	# Chromosome
    chromStart int unsigned not null,	# Start position in chromosome
    chromEnd int unsigned not null,	# End position in chromosome
    name varchar(255) not null,	# Name of item
    score int not null,	# 0-1000 score for useScore shading
    floatScore float not null,	# Floating point score.
    fdr int not null, # False discovery rate
              #Indices
    INDEX(chrom(16),bin),
    INDEX(chrom(16),chromStart),
    INDEX(name(16))
);
'EOF'
# prepare data and load tables for Sites track
# check first that the *hits*.bed files are all sorted for the comm
# command to work correctly. If not, these should be sorted before running the
# loadSites.csh script. 

cat > loadSites.csh << 'EOF'
#!/bin/csh -ef
# Build and load one Sites table (bed5FloatScoreWithFdr layout) per dataset.
# Each dataset directory holds hits files for FDRs of 1, 5 and 10%; every
# site is tagged with the lowest FDR at which it appears.
# The hits files must be sorted for comm to work correctly.
    set pfx = encodeYaleChip
    foreach d (pol2n_hela pol2n_gm06990 pol2_hela pol2_gm06990 h4kac4_hela h4kac4_gm06990 baf155_k562 baf170_k562 baf47_k562 stat1_hela_ifna p65c_hela_tnfa p65n_hela_tnfa)
        echo $d
        # Stim stays empty for the two-part (antibody_cell) names
        set Stim = ""
        if (($d =~ baf*) || ($d =~ pol2*) || ($d =~ h4kac4*)) then
             # antibody_cell
             set factor = `echo $d | sed 's/_.*//'`
             set cell = `echo $d | sed 's/.*_//'`
             set Factor = `echo $factor | perl -wpe 's/(.*)/\u$1/'`
             set Cell = `echo $cell | perl -wpe 's/(.*)/\u$1/'`
             echo $Factor $Cell
        else
             # antibody_cell_treatment
             set factor = `echo $d | sed 's/_.*//'`
             set cell = `echo $d | perl -wpe 's/^.*_(.*)_.*$/$1/'`
             set stim = `echo $d | perl -wpe 's/.*_.*_//' `
             set Factor = `echo $factor | perl -wpe 's/(.*)/\u$1/'`
             set Cell = `echo $cell | perl -wpe 's/(.*)/\u$1/'`
             set Stim = `echo $stim | perl -wpe 's/(.*)/\u$1/'`
             echo $Factor $Cell $Stim
        endif

        # first add FDR rate to Sites data files
        foreach f ($d/*hits*.bed)
            echo $f
            dos2unix $f
            if ($f =~ *hits_1.0.bed) then
               cp $f Sites1.txt
            else if ($f =~ *hits_5.0.bed) then
               cp $f hits5.txt
            else if ($f =~ *hits_10.0.bed) then
               cp $f hits10.txt
            endif
        end
        # comm -13 keeps the lines found only in the second file: sites that
        # are new at 5% (not in the 1% set) and new at 10% (not in the 5% set)
        comm -13 Sites1.txt hits5.txt > Sites5.txt
        comm -13 hits5.txt hits10.txt > Sites10.txt

        # Append the FDR as its own tab-separated column. The record is
        # rebuilt ($1 = $1) so OFS is applied and trailing whitespace is
        # dropped; a plain 'print $0 "1"' would concatenate the FDR onto the
        # score field. Blank lines are skipped.
        foreach fdr (1 5 10)
            awk -v fdr=$fdr 'BEGIN {OFS="\t"} NF > 0 {$1 = $1; print $0, fdr}' \
                Sites$fdr.txt > SitesWithFdr$fdr.txt
        end
        cat SitesWithFdr1.txt SitesWithFdr5.txt SitesWithFdr10.txt > ${d}SitesFdr.txt
        rm hits*.txt Sites*.txt

        # load Sites tables: start is decremented by one (submitter confirmed
        # 1-based starts), items are named Site<n> in file order, score is
        # the data value * 330, and field 5 is the FDR added above
        set f = ${d}SitesFdr.txt
        set table = ${pfx}Sites$Factor$Cell$Stim
        echo $table
        sed -e "s/bed5FloatScoreWithFdr/$table/" \
            bed5FloatScoreWithFdr.sql > $table.sql
        sed 's/=/ /' $f | awk 'BEGIN {ct = 0} \
{ct++; printf "%s\t%d\t%d\t%s\t%d\t%.3f\t%d\n",$1,$2-1,$3,"Site"ct,($4 * 330),$4,$5}' \
            | hgLoadBed -sqlTable=$table.sql hg17 $table stdin
    end
'EOF'
    chmod +x loadSites.csh
    # load tables (hartera, 2007-06-23)
    loadSites.csh >&! loadSites.log &
    egrep 'encode|Loaded' *.log
encodeYaleChipSitesPol2nHela
Loaded 1000 elements of size 7
encodeYaleChipSitesPol2nGm06990
Loaded 1000 elements of size 7
encodeYaleChipSitesPol2Hela
Loaded 1000 elements of size 7
encodeYaleChipSitesPol2Gm06990
Loaded 1000 elements of size 7
encodeYaleChipSitesH4kac4Hela
Loaded 1000 elements of size 7
encodeYaleChipSitesH4kac4Gm06990
Loaded 1000 elements of size 7
encodeYaleChipSitesBaf155K562
Loaded 5 elements of size 7
encodeYaleChipSitesBaf170K562
Loaded 675 elements of size 7
encodeYaleChipSitesBaf47K562
Loaded 668 elements of size 7
encodeYaleChipSitesStat1HelaIfna
Loaded 1000 elements of size 7
encodeYaleChipSitesP65cHelaTnfa
Loaded 1000 elements of size 7
encodeYaleChipSitesP65nHelaTnfa
Loaded 1000 elements of size 7
    # Added subtracks to trackDb/human/hg17/trackDb.encode.ra for
    # the encodeYaleChipSites track. Release beta version created with just 
    # the existing tracks. Release alpha version includes new subtracks. 
    # Added new antibodies and treatments to the description page. Created 
    # new description: encodeYaleChipSitesNew.html for release alpha in the 
    # trackDb entry.  
    
    # Load hits data for Apr07 batch into the Sites track. The April data
    # batch was originally submitted on 2007-03-27 but there was some
    # missing data so it was resubmitted on 2007-06-06 by 
    # guoneng.zhong@yale.edu
    # (hartera, 2007-06-23)
    cd /cluster/data/encode/yale/chip/2007-06-06
    # symbolic links already made for Pval and Signal data:
    # smarca4_hela, smarca6_hela, nrsf_hela, format is antibody_cell
    # copy over the MySQL table definition:
    cp /cluster/data/encode/yale/chip/2007-03-20/bed5FloatScoreWithFdr.sql . 

# check first that the *hits*.bed files are all sorted for the comm
# command to work correctly. If not, these should be sorted before running the
# loadSites.csh script. 
    # create loading script:
cat > loadSites.csh << 'EOF'
#!/bin/csh -ef
# Build and load one Sites table (bed5FloatScoreWithFdr layout) per dataset.
# Each dataset directory holds hits files for FDRs of 1, 5 and 10%; every
# site is tagged with the lowest FDR at which it appears.
    set pfx = encodeYaleChip
    foreach d (smarca4_hela smarca6_hela nrsf_hela)
        echo $d
        # split <antibody>_<cell> and capitalize the first letter of each part
        set factor = `echo $d | sed 's/_.*//'`
        set cell = `echo $d | sed 's/.*_//'`
        set Factor = `echo $factor | perl -wpe 's/(.*)/\u$1/'`
        set Cell = `echo $cell | perl -wpe 's/(.*)/\u$1/'`
        echo $Factor $Cell

        # first add FDR rate to Sites data files
        foreach f ($d/*hits*.bed)
           echo $f
           dos2unix $f
           if ($f =~ *hits_1.0.bed) then
              cp $f Sites1.txt
           else if ($f =~ *hits_5.0.bed) then
              cp $f hits5.txt
           else if ($f =~ *hits_10.0.bed) then
              cp $f hits10.txt
           endif
        end
        # comm -13 keeps the lines found only in the second file: sites that
        # are new at 5% (not in the 1% set) and new at 10% (not in the 5% set)
        comm -13 Sites1.txt hits5.txt > Sites5.txt
        comm -13 hits5.txt hits10.txt > Sites10.txt

        # Append the FDR as its own tab-separated column. The record is
        # rebuilt ($1 = $1) so OFS is applied and trailing whitespace is
        # dropped; a plain 'print $0 "1"' would concatenate the FDR onto the
        # score field. Blank lines are skipped.
        foreach fdr (1 5 10)
            awk -v fdr=$fdr 'BEGIN {OFS="\t"} NF > 0 {$1 = $1; print $0, fdr}' \
                Sites$fdr.txt > SitesWithFdr$fdr.txt
        end
        cat SitesWithFdr1.txt SitesWithFdr5.txt SitesWithFdr10.txt > ${d}SitesFdr.txt
        rm hits*.txt Sites*.txt

        # load Sites tables: start is decremented by one, items are named
        # Site<n> in file order, score is the data value * 330, and field 5
        # is the FDR added above
        set f = ${d}SitesFdr.txt
        set table = ${pfx}Sites$Factor$Cell
        echo $table
        sed -e "s/bed5FloatScoreWithFdr/$table/" \
            bed5FloatScoreWithFdr.sql > $table.sql
        sed 's/=/ /' $f | awk 'BEGIN {ct = 0} \
{ct++; printf "%s\t%d\t%d\t%s\t%d\t%.3f\t%d\n",$1,$2-1,$3,"Site"ct,($4 * 330),$4,$5}' \
            | hgLoadBed -sqlTable=$table.sql hg17 $table stdin
    end
'EOF'
    chmod +x loadSites.csh
    loadSites.csh >&! loadSites.log &
    egrep 'encode|Loaded' loadSites.log
encodeYaleChipSitesSmarca4Hela
Loaded 470 elements of size 7
encodeYaleChipSitesSmarca6Hela
Loaded 1000 elements of size 7
encodeYaleChipSitesNrsfHela
Loaded 1639 elements of size 7
    # Add new subtracks to encodeYaleChipSites release alpha
    # trackDb.encode.ra entry and update description as for the Nov06 and
    # Jan07 data - see above.

    # Load hits data for Jun07 batch into the Sites track.
    # Data was submitted on 2007-06-15 by guoneng.zhong@yale.edu
    # (hartera, 2007-06-23)
    # Work in the Jun07 (submitted 2007-06-15) Yale ChIP directory.
    cd /cluster/data/encode/yale/chip/2007-06-15
    # symbolic links already made for Pval and Signal data:
    # h3k27me3_hela format is antibody_cell
    # copy over the MySQL table definition:
    cp /cluster/data/encode/yale/chip/2007-03-20/bed5FloatScoreWithFdr.sql . 
    
# check first that the *hits*.bed files are all sorted for the comm
# command to work correctly. If not, these should be sorted before running the
# loadSites.csh script. 
    # create loading script:
    # For each antibody_cell directory the script:
    #   - derives capitalized Factor and Cell names from the directory name
    #   - copies the hits files for the 1.0, 5.0 and 10.0 thresholds to
    #     scratch files, then uses comm -13 to keep only the lines that
    #     are new at the 5 and 10 thresholds
    #   - appends the threshold value (1, 5 or 10) to each line and
    #     concatenates the three sets into <dir>SitesFdr.txt
    #   - loads the result into ${pfx}Sites$Factor$Cell with hgLoadBed,
    #     naming items Site<n>, subtracting 1 from the start and using
    #     field 4 * 330 as the integer score
    # NOTE(review): the FDR value is appended with `print $0 "1"`, which is
    # plain string concatenation in awk (OFS is not inserted), while the
    # loader reads the FDR as field 5 after `sed 's/=/ /'`.  This presumably
    # relies on the exact format of the submitted lines - confirm against
    # the data files.
cat > loadSites.csh << 'EOF'
#!/bin/csh -ef
    set pfx = encodeYaleChip
    foreach d (h3k27me3_hela)
        echo $d
        set factor = `echo $d | sed 's/_.*//'`
        set cell = `echo $d | sed 's/.*_//'`
        set Factor = `echo $factor | perl -wpe 's/(.*)/\u$1/'`
        set Cell = `echo $cell | perl -wpe 's/(.*)/\u$1/'`
        echo $Factor $Cell

        # first add FDR rate to Sites data files
        foreach f ($d/*hits*.bed)
           echo $f
           dos2unix $f
           if ($f =~ *hits_1.0.bed) then
              cp $f Sites1.txt
           else if ($f =~ *hits_5.0.bed) then
              cp $f hits5.txt
           else if ($f =~ *hits_10.0.bed) then
              cp $f hits10.txt
           endif
        end
        comm -13 Sites1.txt hits5.txt > Sites5.txt
        comm -13 hits5.txt hits10.txt > Sites10.txt

        awk 'BEGIN {OFS="\t";} {print $0 "1"}' Sites1.txt > SitesWithFdr1.txt
        awk 'BEGIN {OFS="\t";} {print $0 "5"}' Sites5.txt > SitesWithFdr5.txt
        awk 'BEGIN {OFS="\t";} {print $0 "10"}' Sites10.txt > SitesWithFdr10.txt
        cat SitesWithFdr1.txt SitesWithFdr5.txt SitesWithFdr10.txt > ${d}SitesFdr.txt
        rm hits*.txt Sites*.txt

        # load Sites tables 
        set f = ${d}SitesFdr.txt
        set table = ${pfx}Sites$Factor$Cell
        echo $table
        sed -e "s/bed5FloatScoreWithFdr/$table/" \
            bed5FloatScoreWithFdr.sql > $table.sql
        sed 's/=/ /' $f | awk 'BEGIN {ct = 0} \
{ct++; printf "%s\t%d\t%d\t%s\t%d\t%.3f\t%d\n",$1,$2-1,$3,"Site"ct,($4 * 330),$4,$5}' \
            | hgLoadBed -sqlTable=$table.sql hg17 $table stdin
        end
'EOF'
    # run the script in the background, logging stdout and stderr
    chmod +x loadSites.csh
    loadSites.csh >&! loadSites.log &
    # once the job has finished, list the table names and load counts:
    egrep 'encode|Loaded' loadSites.log
encodeYaleChipSitesH3k27me3Hela
Loaded 2553 elements of size 7
    # Add new subtracks to encodeYaleChipSites release alpha
    # trackDb.encoder.ra entry and update description as for the Nov06 and
    # Jan07 data - see above.
    # GEO accessions for each experiment were added to the descriptions for 
    # the Yale Chip Pval, Signal and Sites tracks in the table of the 
    # antibody and antibody target descriptions.
  
    # Manual version of the FDR steps: print each Sites file with its FDR
    # value appended as an extra tab-separated column, and derive the hits
    # that are new at the 5.0 and 10.0 thresholds.
    # csh word lists are whitespace-separated (no commas), and the loop
    # variable is passed to awk with -v because $b inside the single-quoted
    # awk program would be an awk field reference, not the csh variable.
    foreach b (1 5 10)
       awk -v fdr=$b 'BEGIN {OFS="\t"} {print $0, fdr;}' Sites${b}.bed
    end
    comm -13 70561_70573_70629-hits_1.0.bed 70561_70573_70629-hits_5.0.bed \
         > hits5.0.bed
    comm -13 70561_70573_70629-hits_5.0.bed 70561_70573_70629-hits_10.0.bed \
         > hits10.0.bed
    
    # Load hits data for Apr07 batch:
    # Load hits data for Jun07 batch:

##########################################################################
# UTexas STAGE (2005-10-31, 11-17 kate)
# Submitted 10/15 by Akshay Bhinge <akshayb@mail.utexas.edu>
# Resubmitted 11/17
#  2 files - raw and peaks, for c-Myc in HeLa
# range .001 to 1.0.  Peaks restricted to >.8
# Adjusted data in Tags file:  set score=1 items to 300 so
# they'll be visible with gray-scale tags requested by Akshay.
# (This is why it's loaded as blocked bed).  Huh ??
#
# New data (raw tags for STAT1 in HeLa) submitted 2006-10-16
    # Initial load from the 2005-11-17 resubmission (the 2005-10-15
    # directory is the earlier submission, left commented out).
    #cd /cluster/data/encode/UTexas/stage/2005-10-15
    cd /cluster/data/encode/UTexas/stage/2005-11-17
    # Tags: replace a score of 1 with 300 and expand each line to a
    # 12-column blocked bed with a single block spanning the item.
    grep '^chr' lab/myc.tag.prob.bed | \
        awk '{if ($5 == 1) $5 = 300; \
                printf("%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t0\t1\t%d,\t0\n", \
                $1, $2, $3, $4, $5, $6, $2, $3, $3 - $2)}' | \
        hgLoadBed -noBin -strict hg17 encodeUtexStageMycHelaTags stdin
            # 813 elements
    # Peaks: loaded as submitted, keeping only the lines that start with chr.
    grep '^chr' lab/myc.stage.peaks.bed | \
        hgLoadBed -noBin -strict hg17 \
                encodeUtexStageMycHelaPeaks  stdin
            # 26 elements
    # Created composite track with 2 subtracks

    # 2006-10-30
    # Reload cMyc data in simple bed w/o score adjustment
    # and load new data

    # point "latest" at the new 2006-10-16 submission directory
    cd /cluster/data/encode/UTexas/stage
    mkdir -p 2006-10-16/lab
    rm latest; ln -s 2006-10-16 latest
    cd latest
    # STAT1 tags: first five columns only
    grep '^chr' lab/stat1.tags.ucsc.bed | \
       awk '{printf("%s\t%d\t%d\t%s\t%d\n",$1,$2,$3,$4,$5)}' | \
    hgLoadBed -noBin -strict hg17 encodeUtexStageStat1HelaTags stdin
        # Loaded 937 elements of size 6
    checkTableCoords hg17 encodeUtexStageStat1HelaTags

    # c-Myc tags reloaded as 5-column bed into a new table, with the
    # scores left as submitted
    cd ../2005-11-17
    grep '^chr' lab/myc.tag.prob.bed | \
       awk '{printf("%s\t%d\t%d\t%s\t%d\n",$1,$2,$3,$4,$5)}' | \
    hgLoadBed -noBin -strict hg17 encodeUtexStageCMycHelaTags stdin
    
##########################################################################
# Univ. Uppsala, Sweden Chip/chip
# Submitted by Claes & Ola
# 4 files with chrom, start, end, integer score
# Sites file in GFF format
# NOTE: this was submitted in hg16, without notifying us.
# I'm reloading after lifting, 2005-12-05

    cd /cluster/data/encode/Uppsala/2005-11-07
    /cluster/data/encode/bin/scripts/splitTracks.pl lab/chip.wig
    # rename the split pieces t0..t4, in order: four factor files, then sites
    set pieceNames = (Usf1.wig Hnf3b.wig Hnf4a.wig Ach3.wig Sites.gff)
    set pieceIdx = 0
    foreach pieceName ($pieceNames)
        mv t$pieceIdx $pieceName
        @ pieceIdx++
    end

    # load data for individual factors:  lift each factor's hg16 data to
    # hg17 and load it as a 4-column bedGraph table
    # NOTE: rounded overly long float scores
cat > load.csh << 'EOF'
    set chain = /cluster/data/encode/convertHg17/hg16ToHg17.chain
    foreach tf (Usf1 Hnf3b Hnf4a Ach3)
      set lifted = $tf.hg17.bed
      awk '/^chr/ {printf("%s\t%s\t%s\t%.3f\n", $1, $2, $3, $4)}' $tf.wig |\
        liftOver stdin $chain $lifted $tf.unmapped
      hgLoadBed -strict -bedGraph=4 hg17 encodeUppsalaChip$tf $lifted
    end
'EOF'
    csh load.csh >&! load.log &
    
    # sites (they refer to as Tentative Binding Sites)
    # NOTE: I added an item name, of the form "uutbs.#"
    # The file is GFF, so the start coordinate is column 4 (column 2 is the
    # source field).  Sort on chrom then column 4 so that the uutbs.# names,
    # which are assigned from the line number after sorting, follow
    # position order.  The sorted lines are converted to 5-column bed
    # (chrom, start, end, name, score), lifted from hg16 to hg17 and loaded
    # without re-sorting.
    grep -v track Sites.gff | sort -k1,1 -k4,4n | \
        awk '{printf ("%s\t%d\t%d\tuutbs.%d\t%d\n", $1, $4, $5, NR, $6)}' | \
            liftOver stdin /cluster/data/encode/convertHg17/hg16ToHg17.chain \
                sites.hg17.bed sites.unmapped
            hgLoadBed -noSort -noBin -strict hg17 encodeUppsalaChipSites sites.hg17.bed
                # Loaded 327 elements of size 5

##########################################################################
# MSA tracks from Sept. 2005 freeze
# Use links from Wiki for data submission (as per Elliott Margulies)
# NOTE: mapping of sequence name to assembly is in column 7 of
# metadata.txt file in Elliott's MSA release
# Assemblies in this freeze are: canFam1 danRer2 fr1 galGal2 mm6
#       monDom1 panTro1 rheMac1 rn3 tetNig1
# NOTE: reloaded phastCons scores (previously only manual regions
#       were loaded (2006-05-03 kate)
# Reloaded elements with updated files from Elliott (2006-06-22 kate)
    # TBA alignments
    cd /cluster/data/encode/TBA
    mkdir -p SEP-05/lab
    cd SEP-05/lab
    wget ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/msa/SEP-2005/tba.v2.maf.tar
    # unpack the archive; the loop below reads lab/<dir>/*.maf.gz
    tar xvf tba.v2.maf.tar

    cd ..
    # Rewrite the common-name sequence prefixes on the "s" lines as
    # assembly names, writing one uncompressed .maf per input file into
    # the current directory.
    foreach f (lab/*/*.maf.gz)
        echo $f
        gunzip -c $f | \
            sed 's/^s human\./s hg17./;          s/^s dog\./s canFam1./; \
                 s/^s zebrafish\./s danRer2./;   s/^s fugu\./s fr1./; \
                 s/^s chicken\./s galGal2./;     s/^s mouse\./s mm6./; \
                 s/^s monodelphis\./s monDom1./; s/^s chimp\./s panTro1./; \
                 s/^s macaque\./s rheMac1./;     s/^s rat\./s rn3./; \
                 s/^s tetraodon\./s tetNig1./;' \
                        > $f:t:r:r:e.maf
    end
    # link the maf files into /gbdb and load the alignment and summary tables
    set gdir = /gbdb/hg17/encode/TBA/maf
    mkdir -p $gdir
    rm -f $gdir/*.maf
    ln -s /cluster/data/encode/TBA/SEP-05/*.maf $gdir
    hgLoadMaf -pathPrefix=$gdir -WARN hg17 encodeTbaAlign >&! load.log
    # lots of "score too small" messages -- these are OK.
    cat *.maf | hgLoadMafSummary hg17 encodeTbaSummary stdin

    # create tree image:
    # edit tree.nh to create species.nh with common names
    # NOTE(review): this uses the MSA/SEP-2005 directory, while the other
    # MSA steps in this section use SEP-05 - confirm which is intended.
    cd /cluster/data/encode/MSA/SEP-2005
    mkdir phylo
    cd phylo
    wget ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/msa/SEP-2005/phylo/tree_4d.tba.v2.nh
    # render the downloaded tree to PostScript
    /cluster/bin/phast/draw_tree -b -s tree_4d.tba.v2.nh > species28.ps
        # photoshop to enhance, then save as gif/jpg
        # (species28.jpg is the manually saved image; install it for the browser)
        cp /cluster/data/encode/MSA/SEP-2005/phylo/species28.jpg \
            /usr/local/apache/htdocs/images/phylo/species28.jpg

    #  MLAGAN alignments
    cd /cluster/data/encode/MLAGAN
    mkdir -p SEP-05/lab
    cd SEP-05/lab
    wget http://ai.stanford.edu/~asimenos/ENCODE_Oct-2005_maf.tgz
    # unpack the archive; project.csh reads lab/EN[mr]*/<region>.maf
    tar xvfz ENCODE_Oct-2005_maf.tgz
    cd ..
    # project.csh: for each region directory, look up the region's chrom and
    # start and the chrom size in hg17, convert the maf to hg17 chromosome
    # coordinates with mafCoord.pl (adding a score to bare "a" lines), then
    # project the result onto the hg17 sequence with maf_project.
cat > project.csh << 'EOF'
    mkdir -p tmp
    set tmpDir = tmp
    foreach d (lab/EN[mr]*)
        set r = $d:t
        echo $r
        set c = `echo "SELECT chrom from encodeRegions WHERE name='$r'" | \
                        hgsql -N hg17`
        set start =  \
                `echo "SELECT chromStart from encodeRegions WHERE name='$r'" | \
                        hgsql -N hg17`
        set size = \
                `echo "SELECT size from chromInfo WHERE chrom='$c'" | \
                        hgsql -N hg17`
        /cluster/data/encode/MLAGAN/mafCoord.pl < $d/$r.maf \
                human.1 hg17.$c $start $size | \
            sed 's/^a$/a score=0.0/' > $tmpDir/$r.db.maf
        echo "projecting $r"
        /cluster/bin/penn/maf_project $tmpDir/$r.db.maf hg17.$c > $r.maf
        echo "finished $r"
    end
'EOF'
    # run the projection; the *.maf files it writes are linked and loaded below
    csh project.csh >&! project.log
    set gdir = /gbdb/hg17/encode/MLAGAN/SEP-05/maf
    mkdir -p $gdir
    rm -f $gdir/*.maf
    ln -s /cluster/data/encode/MLAGAN/SEP-05/*.maf $gdir
    hgLoadMaf -pathPrefix=$gdir -WARN hg17 encodeMlaganAlign >&! load.log
    # lots of "score too small" messages -- these are OK.
    cat *.maf | hgLoadMafSummary hg17 encodeMlaganSummary stdin

    # MAVID alignments
    cd /cluster/data/encode/MAVID
    mkdir -p SEP-05/lab
    cd SEP-05/lab
    wget http://hanuman.math.berkeley.edu/~cdewey/encode/alignments/ENCODE_SEP-2005_MAVID_MAF_ABS.tar.gz
    # unpack the archive; project.csh reads lab/ABS/*.maf
    tar xvfz ENCODE_SEP-2005_MAVID_MAF_ABS.tar.gz
    cd ..
    # project.csh: for each region maf, look up the region's chrom in hg17,
    # add a score to bare "a" lines and rename the human sequence prefix to
    # hg17, then project onto the hg17 sequence with maf_project.
cat > project.csh << 'EOF'
    set tmpDir = tmp
    # -p so the script can be re-run when tmp already exists
    mkdir -p $tmpDir
    foreach f (lab/ABS/*.maf)
        set r = $f:t:r
        echo $r
        set c = `echo "SELECT chrom from encodeRegions WHERE name='$r'" | \
                        hgsql -N hg17`
        sed 's/^a$/a score=0.0/; s/^s  *human/s hg17/' $f > $tmpDir/$r.maf
        echo "projecting $r"
        /cluster/bin/penn/maf_project $tmpDir/$r.maf hg17.$c > $r.maf
        echo "finished $r"
    end
'EOF'
    # run the projection; the *.maf files it writes are linked and loaded below
    csh project.csh >&! project.log
    set gdir = /gbdb/hg17/encode/MAVID/SEP-05/maf
    mkdir -p $gdir
    rm -f $gdir/*.maf
    ln -s /cluster/data/encode/MAVID/SEP-05/*.maf $gdir
    hgLoadMaf -pathPrefix=$gdir -WARN hg17 encodeMavidAlign >&! load.log
    cat *.maf | hgLoadMafSummary hg17 encodeMavidSummary stdin

    # conserved elements
    # Scores:  binCons are all 1000, gerp range is 6.75 - 4813.26
    #          phastCons is 10-18088
    # Force gerp to integer for consistent table format, but don't
    # bother scaling at this point (and don't use to score on display)
    # For some reason, phastCons has + strand -- strip this out
    # NOTE: coords are ENCODE-region based, so need to adjust
    # by start of region (Elliott used custom tracks offset= to do this).

    # NOTE: Updated GERP elements 2/1/06, with new data from Greg Cooper
    # overwriting Elliott's elements.  This is doc'ed in the GERP section.
    cd /cluster/data/encode/MSA
    mkdir -p SEP-05/elements.2005-12-12/lab
    cd SEP-05/elements.2005-12-12/lab
    wget ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/msa/SEP-2005/cons/target.align.conservation.v1.tar.gz

    # data update from Elliott , to fix off-by-one start coords
    # return to the MSA directory first: the paths below are relative to it,
    # not to the elements.2005-12-12/lab directory entered above
    cd /cluster/data/encode/MSA
    mkdir -p SEP-05/elements.2006-06-22/lab
    cd SEP-05/elements.2006-06-22/lab
    # copy in align_elements_tracks.tar.gz
    # contains 9 tracks of elements (3 aligners * binCons, gerp, phastCons)
    cd ..
    # load.csh: each lab/<cons>.<align>.bed file is loaded into the table
    # encode<Align><Cons>El, where both name parts are taken from the file
    # name and have their first letter upper-cased.
cat > load.csh << 'EOF'
    foreach f (lab/*.bed)
        set root = $f:t:r
        set align = `echo $root:e | perl -wpe  's/(.*)/\u$1/'`
        set cons = `echo $root:r | perl -wpe 's/(.*)/\u$1/'`
        set table = encode${align}${cons}El
        hgLoadBed -strict hg17 $table $f
    end
'EOF'
    csh load.csh >&! load.log &

    # CONSENSUS ELEMENTS
    cd /cluster/data/encode/MSA
    mkdir -p SEP-05/consensus/lab
    cd SEP-05/consensus/lab
    wget ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/msa/SEP-2005/cons/consensus.conservation.v1.tar.gz
    # NOTE(review): no unpack step for the tar.gz is recorded here; the
    # links below expect the .bed files directly under lab/ - confirm.
    cd ..
    # give the three consensus sets names that become the table names
    ln -s lab/or.or.bed MsaElUnion.bed
    ln -s lab/and.and.bed MsaElIntersect.bed
    ln -s lab/two.two.bed MsaElModerate.bed
    # load.csh: load each MsaEl*.bed into the table encode<file root>
cat > load.csh << 'EOF'
    foreach f (MsaEl*.bed)
        echo $f
        set b = $f:r
        set t = encode$b
        hgLoadBed -strict -noBin hg17 $t $f
    end
'EOF'
    csh load.csh >&! load.log
        #Reading MsaElIntersect.bed
        #Loaded 30645 elements of size 4
        #Reading MsaElModerate.bed
        #Loaded 36793 elements of size 4

    # conservation
    cd /cluster/data/encode/MSA
    mkdir -p SEP-05/conservation/lab
    cd SEP-05/conservation/lab
    wget ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/msa/SEP-2005/cons/phastCons.wig.tar.gz
    # unpack the archive; load.csh reads lab/<aligner>/*/phast/human.EN*.gz
    tar xvfz phastCons.wig.tar.gz
    cd ..

    # load.csh: for each aligner, wigEncode the per-region phastCons scores,
    # link the .wib file into the aligner's /gbdb directory and load the
    # wiggle table.
cat > load.csh << 'EOF'
    # TBA
    gunzip -c lab/tba/*/phast/human.EN*.gz | \
        wigEncode stdin tbaPhastCons.wig tbaPhastCons.wib
    set d = /gbdb/hg17/encode/TBA/SEP-05
    ln -s `pwd`/tbaPhastCons.wib $d
    hgLoadWiggle -pathPrefix=$d hg17 encodeTbaPhastCons tbaPhastCons.wig

    # MLAGAN
    gunzip -c lab/mlagan/*/phast/human.EN*.gz | \
        wigEncode stdin mlaganPhastCons.wig mlaganPhastCons.wib
    set d = /gbdb/hg17/encode/MLAGAN/SEP-05
    ln -s `pwd`/mlaganPhastCons.wib $d
    hgLoadWiggle -pathPrefix=$d hg17 encodeMlaganPhastCons mlaganPhastCons.wig

    # MAVID
    gunzip -c lab/mavid/*/phast/human.EN*.gz | \
        wigEncode stdin mavidPhastCons.wig mavidPhastCons.wib
    set d = /gbdb/hg17/encode/MAVID/SEP-05
    ln -s `pwd`/mavidPhastCons.wib $d
    hgLoadWiggle -pathPrefix=$d hg17 encodeMavidPhastCons mavidPhastCons.wig

'EOF'
    csh load.csh >&! load.log &

##########################################################################
# MSA GERP Conservation (2005-02-06 kate)
#  Submitted 2/1/06 by Greg Cooper

    cd /cluster/data/encode/MSA/Gerp
    mkdir -p 2006-02-01/lab
    cd 2006-02-01/lab
    wget http://baumbox.stanford.edu/~coopergm/ENCODE/GERP_Cons_SepFreeze_Jan.zip 
    unzip GERP_Cons_SepFreeze_Jan.zip 
    cd ..

    # Encode and load the GERP scores for each aligner in turn.  The three
    # lists are parallel: aligner name as used in the file and /gbdb paths,
    # lower-case output file prefix, and capitalized table-name part.
    # Data limits reported by wigEncode:
    #   TBA:     upper limit 4.48, lower limit -29.86
    #   MLAGAN:  upper limit 4.48, lower limit -25.74
    #   MAVID:   upper limit 4.48, lower limit -22.58
    set gerpAligners = (TBA MLAGAN MAVID)
    set gerpPrefixes = (tba mlagan mavid)
    set gerpTableTags = (Tba Mlagan Mavid)
    foreach i (1 2 3)
        set gerpBase = $gerpPrefixes[$i]GerpCons
        cat lab/chr*_GERP_$gerpAligners[$i]_scores.wig | \
            wigEncode stdin $gerpBase.wig $gerpBase.wib
        set d = /gbdb/hg17/encode/$gerpAligners[$i]/SEP-05
        ln -s /cluster/data/encode/MSA/Gerp/2006-02-01/$gerpBase.wib $d
        hgLoadWiggle -pathPrefix=$d hg17 encode$gerpTableTags[$i]GerpCons $gerpBase.wig
    end

    # Elements.  Note: scores from 307-1000.  This data also
    # includes a 6th field with an unscaled float score, which
    # will be included in the table, but not used for display
    # with unscaled scores.

    # Adding item names (<region>.#) for consistency with other MSA elements
    # subtracks

    lab/GERP_TBA_Cons.bed

    # Post wiggles for downloads (2007-04-16 kate)
    ssh kkstore03
    cd /cluster/data/encode/MSA/Gerp
    cd 2006-02-01
    mkdir -p downloads
    # makeDownloads.csh: for each aligner, concatenate the per-chromosome
    # score files in numeric chromosome order (the lab/chr prefix is
    # stripped for the sort and then restored) into one gzipped file.
    # The output goes into the downloads/ directory created above.
cat > makeDownloads.csh << 'EOF'
    foreach prog (TBA MLAGAN MAVID)
        echo $prog
        ls lab/chr*_${prog}_*.wig | \
            sed 's/lab\/chr//' | sort -n | sed 's/^/lab\/chr/' | \
            xargs cat | \
            gzip -c -4 > downloads/GERP_${prog}.scores.wig.gz
    end
'EOF'
# << happy emacs
    csh makeDownloads.csh >&! makeDownloads.log &


##########################################################################
# MSA SCONE Conservation (2005-12-12 kate)
# From Harvard Med School, Saurabh Asthana <faplap@gmail.com>
# Resubmitted 12/21/05
# Resubmitted 6/22/07 (kate)

    # Fetch the SCONE element (bed) and score (wig) archives into
    # lab/bed and lab/wig of the 2005-12-21 submission directory.
    cd /cluster/data/encode/MSA
    mkdir -p SconeCons/2005-12-21/lab
    ln -s SconeCons/2005-12-21 latest
    cd latest/lab
    mkdir bed; cd bed
    wget http://genetics.bwh.harvard.edu/graft/bed/sconeRegions.NOV-2005.bed.tar.bz2
    bunzip2 sconeRegions.NOV-2005.bed.tar.bz2
    tar xvf sconeRegions.NOV-2005.bed.tar
    cd ..

    # now back in lab/: create wig here so the files end up in lab/wig,
    # which is where the load steps read them (lab/wig/*.wig)
    mkdir wig; cd wig
    wget http://ika.bwh.harvard.edu/graft/wig/scone.NOV-2005.wig.tar.bz2
    bunzip2 scone.NOV-2005.wig.tar.bz2
    tar xvf scone.NOV-2005.wig.tar
    # up two levels: from lab/wig to the submission directory
    cd ../..

    # elements
    # load.csh: collect the chr lines of every lab/bed/*.bed file into one
    # 5-column bed (first four input columns plus a fixed score of 1000)
    # and load it as encodeTbaSconeEl.
cat > load.csh << 'EOF'
    set out = sconeRegions.bed
    rm -f $out
    foreach f (lab/bed/*.bed)
        set r = $f:t:r
        echo $r
        grep '^chr' $f | \
            awk '{printf("%s\t%d\t%d\t%s\t1000\n", \
                        $1,$2,$3,$4)}' >> $out
    end
    hgLoadBed -strict hg17 encodeTbaSconeEl $out
'EOF'
    csh load.csh >&! load.log &
        # Loaded 18817 elements 

    # compare coverage of the ENCODE regions by each TBA element set
    featureBits -enrichment hg17 encodeRegions encodeTbaSconeEl
        # encodeRegions 1.047%, encodeTbaSconeEl 0.083%, both 0.083%, cover 7.92%, enrich 95.55x
    featureBits -enrichment hg17 encodeRegions encodeTbaPhastConsEl
       # encodeRegions 1.047%, encodeTbaPhastConsEl 0.063%, both 0.063%, cover 6.04%, enrich 95.55x
    featureBits -enrichment hg17 encodeRegions encodeTbaGerpEl
       # encodeRegions 1.047%, encodeTbaGerpEl 0.057%, both 0.057%, cover 5.47%, enrich 95.55x
    featureBits -enrichment hg17 encodeRegions encodeTbaBinConsEl
        # encodeRegions 1.047%, encodeTbaBinConsEl 0.060%, both 0.060%, cover 5.71%, enrich 95.55x

    # conservation
    # encode all score files into one wiggle, link the .wib into the TBA
    # /gbdb directory and load the table
    cat lab/wig/*.wig | \
        wigEncode stdin tbaScone.wig tbaScone.wib
    set d = /gbdb/hg17/encode/TBA/SEP-05
    ln -s `pwd`/tbaScone.wib $d
    hgLoadWiggle -pathPrefix=$d hg17 encodeTbaSconeCons tbaScone.wig 

# Resubmitted 6/22/07 (kate)
    ssh kkstore03
    cd /cluster/data/encode/MSA/SconeCons
    mkdir -p 2007-06-22/lab
    # -f: do not fail when there is no previous link in this directory.
    # The link lives in SconeCons, so its target is relative to SconeCons.
    rm -f latest
    ln -s 2007-06-22 latest
    cd latest/lab
    wget -r -nd http://ika.bwh.harvard.edu/graft/tracks/pvalue.SCONE.wig.tar.bz2
    bunzip2 pvalue.SCONE.wig.tar.bz2
    tar xvf pvalue.SCONE.wig.tar
    cd ..

    # conservation
    # encode the resubmitted score files, link the .wib into a separate
    # Update/Scone /gbdb directory and load them as a new "Update" table
    ssh hgwdev
    cd /cluster/data/encode/MSA/SconeCons/latest
    cat lab/*.wig | \
        wigEncode stdin tbaScone.wig tbaScone.wib
    set d = /gbdb/hg17/encode/TBA/SEP-05/Update/Scone
    mkdir -p $d
    ln -s /cluster/data/encode/MSA/SconeCons/2007-06-22/tbaScone.wib $d
    hgLoadWiggle -pathPrefix=$d hg17 encodeTbaSconeConsUpdate tbaScone.wig 

    # elements
    ssh kkstore03
    cd /cluster/data/encode/MSA/SconeCons
    cd 2007-06-22/lab
    mkdir bed
    cd bed
    wget -r -nd http://ika.bwh.harvard.edu/graft/tracks/elements.SCONE.bed.tar.bz2
    bunzip2 elements.SCONE.bed.tar.bz2
    tar xvf elements.SCONE.bed.tar
    # back up to the submission directory (2007-06-22): load.csh reads
    # lab/bed/*.bed and must leave sconeRegions.bed where it is loaded from
    cd ../..

    # load.csh: collect the chr lines of every lab/bed/*.bed file into one
    # 5-column bed (first four input columns plus a fixed score of 1000)
cat > load.csh << 'EOF'
    set out = sconeRegions.bed
    rm -f $out
    foreach f (lab/bed/*.bed)
        set r = $f:t:r
        echo $r
        grep '^chr' $f | \
            awk '{printf("%s\t%d\t%d\t%s\t1000\n", \
                        $1,$2,$3,$4)}' >> $out
    end
'EOF'
    # run in the foreground: the load below needs the finished file
    csh load.csh >&! load.log
    ssh hgwdev
    cd /cluster/data/encode/MSA/SconeCons/2007-06-22
    hgLoadBed hg17 encodeTbaSconeElUpdate sconeRegions.bed
        # Loaded 33293 elements of size 5
        # previous: Loaded 18817 elements 

##########################################################################
# MSA Conservation  (2005-12-07 kate)
# Just phastCons and GERP for this freeze  (x3 aligners)

    cd /cluster/data/encode/MSA/SEP-05
    mkdir -p conservation/lab
    cd conservation/lab
    wget ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/msa/SEP-2005/cons/phastCons.wig.tar.gz
    tar xvfz phastCons.wig.tar.gz
    cd ..

    # load.csh handles the three aligners (TBA, MLAGAN, MAVID) in one loop:
    # encode the human.ENm* phastCons scores, link the .wib file into the
    # aligner's /gbdb directory, and load the wiggle table.
cat > load.csh << 'EOF'
    # parallel lists: /gbdb directory name, lab subdirectory and file
    # prefix, table-name part
    set gbdbNames = (TBA MLAGAN MAVID)
    set filePrefixes = (tba mlagan mavid)
    set tableTags = (Tba Mlagan Mavid)
    foreach i (1 2 3)
        set base = $filePrefixes[$i]PhastCons
        gunzip -c lab/$filePrefixes[$i]/*/phast/human.ENm*.gz | \
            wigEncode stdin $base.wig $base.wib
        set d = /gbdb/hg17/encode/$gbdbNames[$i]/SEP-05
        ln -s `pwd`/$base.wib $d
        hgLoadWiggle -pathPrefix=$d hg17 encode$tableTags[$i]PhastCons $base.wig
    end
'EOF'
    csh load.csh >&! load.log

          
##########################################################################
# MSA alignment agreement
#  From Ariel Schwartz, UC Berkeley

    cd /cluster/data/encode/MSA
    # -p: the alignAgreement and 2005-11-16 parent directories are new
    mkdir -p alignAgreement/2005-11-16/lab
    # work in the submission directory, not lab/: split.csh reads
    # lab/*.wig.gz and appends to the per-track files created here
    cd alignAgreement/2005-11-16
    touch Mean.wig MavidMlagan.wig MavidTba.wig MlaganTba.wig 
    touch MavidUngapped.wig MlaganUngapped.wig TbaUngapped.wig
    # split.csh: unzip each submitted file, split it into its 7 tracks
    # (t0..t6), append each piece to the matching per-track file, then
    # re-compress the input.
cat > split.csh << 'EOF'
    foreach f (lab/*.wig.gz)
        echo $f
        gunzip $f
        /cluster/data/encode/bin/scripts/splitTracks.pl $f:r
        cat t0 >> Mean.wig
        cat t1 >> MavidMlagan.wig
        cat t2 >> MavidTba.wig
        cat t3 >> MlaganTba.wig
        cat t4 >> MavidUngapped.wig
        cat t5 >> MlaganUngapped.wig
        cat t6 >> TbaUngapped.wig
        rm t?
        gzip $f:r
    end
'EOF'
    # run in the foreground: load.csh needs the completed per-track files
    csh split.csh >&! split.log
    mkdir wig wib
    # load.csh: for each per-track file, drop browser/track lines, encode
    # into wig/ and wib/, load the table encodeMsaAlign<track> and link the
    # .wib file into /gbdb.
cat > load.csh << 'EOF'
    set dir = /gbdb/hg17/encode/MSA/alignAgree/2005-11-16
    mkdir -p $dir
    foreach f (*.wig)
        set table = encodeMsaAlign$f:r
        echo $table
        egrep -v "browser|track" $f | \
            wigEncode stdin wig/$table.wig wib/$table.wib
        hgLoadWiggle -pathPrefix=$dir hg17 $table wig/$table.wig
        ln -s `pwd`/wib/$table.wib $dir
    end
'EOF'
    csh load.csh >&! load.log &

##########################################################################
# Harvard TBA Conservation (2005-12-12 kate)
#  From <faplap@gmail.com> Saurabh Asthana
# Dept. of Medicine, Brigham & Women's Hospital, Harvard Medical School

    cd /cluster/data/encode/MSA/SconeCons
    mkdir -p 2005-12-01/lab
    cd 2005-12-01/lab
    wget http://ika.bwh.harvard.edu/graft/wig/scone.NOV-2005.wig.tar.bz2
    wget http://genetics.bwh.harvard.edu/graft/bed/sconeRegions.NOV-2005.bed.tar.bz2
    mkdir -p bed wig
    # NOTE: files are actually gzipped
    mv scone.NOV-2005.wig.tar.bz2 wig/scone.wig.tar.gz
    mv sconeRegions.NOV-2005.bed.tar.bz2 bed/sconeRegions.bed.tar.gz
    # NOTE(review): the renamed archives still need to be unpacked in
    # lab/wig and lab/bed before the steps below; that step is not recorded.
    cd ..

    # Conservation scores
    # output names follow the other SCONE wigEncode steps in this doc
    cat lab/wig/*.wig | grep -v track | \
        wigEncode stdin tbaScone.wig tbaScone.wib
    # Conserved Elements
    # Add these to the TBA Elements track as a subtrack
    # For table consistency, assign item names of the form <region>.#,
    # and a score=1000
    set bed = sconeRegions.bed
    # start from an empty output file: the loop appends to it
    rm -f $bed
    foreach f (lab/bed/*.bed)
        set r = $f:t:r
        echo $r
        grep '^chr' $f | \
            awk -v REGION=$r '{printf("%s\t%d\t%d\t%s.%d\t%d\n", \
                $1,$2,$3,REGION, NR,1000)}' >> $bed
    end
    hgLoadBed -strict hg17 encodeTbaSconeEl $bed
        # Loaded 18784 elements of size 5

    
##########################################################################
# UW/Regulome Chromatin Accessibility Profiling (CAP) - RENAMED, see below
#  Submitted 2006-1-17 by Scott Kuehn
# update of data received on 2006-05-04 (sent to Kate) by Scott Kuehn
# Update done 2006-05-19 - 2006-05-23 (hartera)
# Not called CAP anymore, now called DNase array for DNase I
# Track short label is now: UW DNase GM 
# long label: ENCODE UW DNase/Array GM06690 - DNase I 
# sensitivity/hypersensitivity in GM06990 Cells 
# Data is for lymphoblastoid cells (GM06990).
# Updated long label in trackDb.ra to: UW Array DNase I 
# sensitivity/hypersensitivity in GM06990 Cells (hartera, 2007-02-26)
    cd /cluster/data/encode/Regulome
    mkdir -p 2006-05-04/lab
    cd 2006-05-04
    # Probes: keep chrom, start, end and column 5 (rounded to 3 decimals),
    # sort by position, pass through trimOverlap.pl and load as bedGraph.
    awk '{printf("%s\t%s\t%s\t%.3f\n", $1, $2, $3, $5)}' \
                lab/Encode.DNase-Array-GM06990.Probes.hg17.bed | \
        sort -k1,1 -k2,2n  | \
        /cluster/data/encode/bin/scripts/trimOverlap.pl > sens.bed
    hgLoadBed -strict -bedGraph=4 hg17 encodeRegulomeDnaseGM06990Sens sens.bed

    # the Encode.DNase-Array-GM06990.DHSs.hg17.bed file has a float score
    # use the encodeRegulomeDnaseSitesSKNSH.sql renamed as sites.sql which 
    # has an int and a float score field
    # rewrite the table name in the copied definition in place
    perl -pi.bak -e 's/SitesSKNSH/GM06990Sites/' sites.sql
    rm *.bak
    # scale scores to 0-1000. use linear transform.
    # (integer score = float score * 105; the float is kept as column 6)
    awk '{printf "%s\t%d\t%d\t%s\t%d\t%s\n", $1,$2,$3,$4,($5 * 105),$5}' \
        lab/Encode.DNase-Array-GM06990.DHSs.hg17.bed \
        | sort -k1,1 -k2,2n > linearScaledSites.bed
    hgLoadBed -sqlTable=sites.sql hg17 encodeRegulomeDnaseGM06990Sites \
        linearScaledSites.bed
    # authors provided track-description.html
    # Add this to trackDb/human/hg17 as encodeRegulomeDnaseArray.html
    # trackDb entry - track is renamed as encodeRegulomeDnaseArray
    # Previously track was called encodeRegulomeCap.
   
##########################################################################
# SANGER CHIP/CHIP  (2006-03-16 kate)
# Updated (2006-08-08 kate)
#  5 histone mods in HFL cells, to be added to existing track
#  Submitted by Rob Andrews
#  Data in two additional cell lines (MOLT4 and PTR8) submitted
#  8/8/06 by Rob -- 8 additional subtracks.
# Updated (DONE, 2007-01-09, hartera)
# 4 histone mods in GM06990 cells and data for CTCF antibody (CCCTC-binding
# factor (zinc finger protein)) also in GM06990 to be added to existing track.
# Total of 5 new subtracks submitted by Rob Andrews: rma@sanger.ac.uk
# Update of the above 5 new subtracks (DONE, 2007-01-18, hartera)
# The 2007-01-09 update had incorrect data. Corrected data was received 
# on 2007-01-18 for the five new subtracks and the track was updated. 
# New data was submitted by Christopher Koch: cmk@sanger.ac.uk
    ssh hgwdev
    cd /cluster/data/encode/sanger/chipchip
    mkdir -p 2006-03-16/lab
    cd 2006-03-16
    cp /var/ftp/encode/*.wig.txt lab

    # one position-sorted bed per histone mark in HFL-1 cells, keeping
    # only the data lines (those starting with chr)
    foreach mark (H3K4me1 H3K4me2 H3K4me3 H3ac H4ac)
        grep "^chr" lab/${mark}_HFL-1_1.wig.txt | sort -k1,1 -k2,2n > \
            ${mark}HFL1.bed
    end
    # load.csh: load each bed file as a bedGraph table named
    # encodeSangerChip<file root>
cat > load.csh << 'EOF'
    foreach bedFile (*.bed)
        set root = $bedFile:r
        echo $root
        hgLoadBed -bedGraph=4 hg17 encodeSangerChip$root $root.bed
    end
'EOF'
    csh load.csh >&! load.log &
    # loaded 23996 elements for 5 tables

    ssh hgwdev
    cd /cluster/data/encode/sanger/chipchip
    # create the same directory that is entered on the next line
    mkdir -p 2006-08-08/lab
    cd 2006-08-08
    cp /var/ftp/encode/*.wig.txt lab

    # one position-sorted bed per histone mark and cell line, keeping only
    # the data lines (those starting with chr)
    # PTR8: three marks
    foreach mark (H3K4me1 H3K4me2 H3K4me3)
        grep "^chr" lab/${mark}_PTR8_1.wig.txt | sort -k1,1 -k2,2n > \
            ${mark}Ptr8.bed
    end
    # MOLT4: five marks
    foreach mark (H3K4me1 H3K4me2 H3K4me3 H3ac H4ac)
        grep "^chr" lab/${mark}_MOLT4_1.wig.txt | sort -k1,1 -k2,2n > \
            ${mark}Molt4.bed
    end
    # load.csh: load each bed file as a bedGraph table named
    # encodeSangerChip<file root>
cat > load.csh << 'EOF'
    foreach f (*.bed)
        set t = $f:r
        echo $t
        hgLoadBed -bedGraph=4 hg17 encodeSangerChip$t $t.bed
    end
'EOF'
    csh load.csh >&! load.log &
    # loaded 23983 elements for 8 tables

    # update by adding these additional 5 subtracks.
    # replaced the 5 new subtracks with corrected data (2007-01-18)
    ssh hgwdev
    cd /cluster/data/encode/sanger/chipchip
    mkdir -p 2007-01-18/lab
    # repoint "latest" at the corrected submission
    rm latest
    ln -s /cluster/data/encode/sanger/chipchip/2007-01-18 \
      /cluster/data/encode/sanger/chipchip/latest
    cd latest
    mv /var/ftp/encode/*.wig.txt lab

    # one position-sorted bed per GM06990 antibody, keeping only the data
    # lines (those starting with chr)
    foreach antibody (CTCF H3K27me3 H3K36me3 H3K79me3 H3K9me3)
        grep "^chr" lab/${antibody}_GM06990_1.wig.txt | sort -k1,1 -k2,2n > \
             ${antibody}.bed
    end
    # load data into hg17 database
cat > load.csh << 'EOF'
    foreach bedFile (*.bed)
        set root = $bedFile:r
        echo $root
        hgLoadBed -bedGraph=4 hg17 encodeSangerChip$root $root.bed
    end
'EOF'
    csh load.csh >&! load.log &
    # update trackDb/human/hg17/trackDb.encode.ra to add these new subtracks
    # to the encodeSangerChip track. update the encodeSangerChip.html to 
    # list the new antibodies used and links for them from the information on
    # the Sanger data access page:
    # http://www.sanger.ac.uk/PostGenomics/encode/data-access.shtml

#######################################################################
# ENCODE PSEUDOGENE TRACK (DONE, 2006-03-30, hartera)
    # Yontao reloaded the encodePseudogeneUcsc2 table with shorter 
    # names for the pseudogenes as they were cut off in the browser so now
    # NM_001017421|chr2|+|1 would be NM_001017421|1
    # The class table needs to be reloaded. Yontao provided a file:
    # encodePseudogeneUcsc2-forload.class
    # get a dump of the current table without the ucsc2 entries.
    ssh hgwdev
    cd /cluster/data/encode/pseudogene/class
    # dump every class row except those owned by ucsc2
    hgsql -N -e 'select * from encodePseudogeneClass where owner != "ucsc2";' \
          hg17 > encodePseudogeneClassNoUcsc2.txt
    # add the new ucsc2 rows and sort on column 3
    cat encodePseudogeneClassNoUcsc2.txt encodePseudogeneUcsc2-forload.class \
        > allPseudogenesClass.txt
    sort -k3,3 allPseudogenesClass.txt > encodePseudogeneClass2.txt 
    # the consensus sequences have different names in the Class table, the
    # names had been changed to Vega gene names. Get the Class from the gtf in
    # /cluster/data/encode/pseudogene/consensus
    # (columns 10 and 2 of the gtf, made unique)
    awk 'BEGIN {OFS="\t"} {print $10, $2}' \
         ../consensus/consensus.jan6.hg17.gtf | sort | uniq \
         > pgConsensus.class
    # strip the VEGA_ prefix, the quotes and the first semicolon
    sed -e 's/VEGA_//' pgConsensus.class | sed -e 's/"//g' \
           | sed -e 's/;//' > pgConsensusClass.txt
    wc -l pgConsensusClass.txt
    # 201 pgConsensusClass.txt
    # append the owner column
    awk 'BEGIN {OFS="\t"}{print $0,"consensus"}' pgConsensusClass.txt | sort \
        > pgConsensusClassSorted.txt
    # reload the encodePseudogeneClass table
    hgsql -N -e 'select * from encodePseudogeneClass where owner != "ucsc2" and owner != "consensus";' \
          hg17 > pseudoClassNoUcsc2OrConsensus.txt
    cat pseudoClassNoUcsc2OrConsensus.txt encodePseudogeneUcsc2-forload.class \
        pgConsensusClassSorted.txt > allPseudogenesClass.txt
    sort -k3,3 allPseudogenesClass.txt > encodePseudogeneClass2.txt 
    wc -l encodePseudogeneClass2.txt
    # 995 encodePseudogeneClass2.txt
    # only 830 load as there are duplicate names - 165 names are shared
    # between the consensus and havana subtracks. These names
    # need to be unique as they are the primary key. Checked that the 
    # class is the same for havana and consensus subtracks where the 
    # name is the same so reload table with one entry for these genes.
    # remove havana and consensus pseudogenes
    grep -v havana allPseudogenesClass.txt | grep -v consensus \
         > pseudoNoHavananNoCons.txt
    wc -l pseudoNoHavananNoCons.txt
    # 616 pseudoNoHavananNoCons.txt
    # NOTE(review): the input files consOnly, havanaOnly2 and
    # nameAndClass.ConsAndHavana used below are not created by any command
    # recorded in this section.
    # prepare consensus set not in havana
    awk 'BEGIN {OFS="\t"}{print $0,"consensus"}' consOnly > consOnlyWithOwner
    # prepare havana set not in consensus
    awk 'BEGIN {OFS="\t"}{print $0,"havana"}' havanaOnly2 > havanaOnlyWithOwner
    # prepare set common to consensus and havana
    awk 'BEGIN {OFS="\t"}{print $0,"havana or consensus"}' \
        nameAndClass.ConsAndHavana > havanaAndConsWithOwner
    wc -l *Owner
    # 36 consOnlyWithOwner
    # 165 havanaAndConsWithOwner
    # 2 havanaOnlyWithOwner
    cat pseudoNoHavananNoCons.txt consOnlyWithOwner havanaAndConsWithOwner \
        havanaOnlyWithOwner > allPseudogenesClass2.txt
    sort -k3,3 allPseudogenesClass2.txt > encodePseudogeneClass2.txt 
    wc -l encodePseudogeneClass2.txt
    # 819 encodePseudogeneClass2.txt
    # reload table
    # (drop it, recreate it from the .sql definition, then bulk-load the
    # merged file)
    hgsql -e 'drop table encodePseudogeneClass;' hg17
    hgsql hg17 < encodePseudogeneClass.sql
    echo "load data local infile 'encodePseudogeneClass2.txt' into \
         table encodePseudogeneClass" | hgsql hg17


##########################################################################
# Univ. Uppsala, Sweden Chip/chip (butyrate-treated H3Ac, H4Ac)
# Submitted 5/29/06 by Adam Ameur <mada@lcb.uu.se>
# 6 subtracks
#	DONE - 2006-06-13 - Hiram

    cd /cluster/data/encode/Uppsala
    mkdir -p 2006-05-09/lab
    cd 2006-05-09/lab
    unzip H3H4ac_butyrate.zip
    # splitTrack.pl: read H3H4ac_butyrate.tracks and start a new output
    # file (track_1, track_2, ...) at every line beginning with "track";
    # the track lines themselves are not written, all other lines are
    # copied to the current output file.
    cat << '_EOF_' > splitTrack.pl
#!/usr/bin/env perl

use warnings;
use strict;

open (FH,"<H3H4ac_butyrate.tracks") or die "can not open H3H4ac_butyrate.tracks: $!";

my $trackCount = 1;
my $outFile="track1";

while(my $line=<FH>)
{
    if ($line =~ m/^track/) {
        $outFile = sprintf("track_%d", $trackCount++);
        open (OUT,">$outFile") or die "can not open $outFile: $!";
    } else {
            print OUT $line;
    }
}

close(OUT);

close (FH)
'_EOF_'
    # << emacs happy
    chmod +x splitTrack.pl
    ./splitTrack.pl
    #	looking at the track definitions to get some reasonable names:
    grep "^track" H3H4ac_butyrate.tracks
    # rename the pieces: four wiggle subtracks and two itemRgb subtracks
    mv track_1 encodeUppsalaChipH3acBut0h.wig.txt
    mv track_2 encodeUppsalaChipH3acBut12h.wig.txt
    mv track_3 encodeUppsalaChipH4acBut0h.wig.txt
    mv track_4 encodeUppsalaChipH4acBut12h.wig.txt
    mv track_5 encodeUppsalaChipH3acBut0vs12.itemRgb.txt
    mv track_6 encodeUppsalaChipH4acBut0vs12.itemRgb.txt
    #	encoding
    #	One wigEncode run per wiggle subtrack.  Data limits it reported:
    #	    H3acBut0h	upper limit 15.68, lower limit 0.17
    #	    H3acBut12h	upper limit 6.55, lower limit 0.22
    #	    H4acBut0h	upper limit 14.47, lower limit 0.19
    #	    H4acBut12h	upper limit 6.58, lower limit 0.05
    for T in H3acBut0h H3acBut12h H4acBut0h H4acBut12h
    do
	wigEncode encodeUppsalaChip${T}.wig.txt \
	    encodeUppsalaChip${T}.wig encodeUppsalaChip${T}.wib
    done

    #	link the binary .wib files into /gbdb, where the tables point
    mkdir /gbdb/hg17/encode/Uppsala
    ln -s $(pwd)/*.wib /gbdb/hg17/encode/Uppsala/

    #	using the -tmpDir will cause the temp file to be removed
    for T in H3acBut0h H3acBut12h H4acBut0h H4acBut12h
    do
	hgLoadWiggle -tmpDir=/scratch/tmp hg17 encodeUppsalaChip${T} \
	    -pathPrefix=/gbdb/hg17/encode/Uppsala encodeUppsalaChip${T}.wig
    done


    #	they don't have their score data normalized, find min, max, etc...
    ave -col=5 encodeUppsalaChipH3acBut0vs12.itemRgb.txt
    #	min 0.404995
    #	max 7.091458
    #	range = max - min:
    echo "7.091458 - 0.404995" | bc
    #	6.686463

    #	Rescale column 5 to an integer score, 1000*(value - lo)/range, and
    #	pass columns 1-4 and 6-9 through unchanged, tab separated.
    #	lo is the min value minus 0.000001, to avoid -0 in the output.
    awk -v lo=0.404994 -v span=6.686463 '{
	printf "%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\t%s\n",
	    $1, $2, $3, $4, 1000.0*($5 - lo)/span, $6, $7, $8, $9
    }' encodeUppsalaChipH3acBut0vs12.itemRgb.txt \
	| hgLoadBed -tmpDir=/scratch/tmp -strict hg17 \
	    encodeUppsalaChipH3acBut0vs12 stdin

    #	using the -tmpDir will cause the temp file to be removed

    #	same deal for the other one
    ave -col=5 encodeUppsalaChipH4acBut0vs12.itemRgb.txt
    #	min 0.347273
    #	max 2.833333
    echo "2.833333 - 0.347273" | bc
    #	2.486060
    #	same rescaling, again with lo = min - 0.000001
    awk -v lo=0.347272 -v span=2.486060 '{
	printf "%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\t%s\n",
	    $1, $2, $3, $4, 1000.0*($5 - lo)/span, $6, $7, $8, $9
    }' encodeUppsalaChipH4acBut0vs12.itemRgb.txt \
	| hgLoadBed -tmpDir=/scratch/tmp -strict hg17 \
	    encodeUppsalaChipH4acBut0vs12 stdin

    #	To see what would be reasonable view limits, look at these
    #	histograms and see where the majority of the data is
    hgWiggle -doHistogram -hBinSize=0.16 -hBinCount=100 \
	-hMinVal=0.0 -db=hg17 encodeUppsalaChipH3acBut0h
    #	running each of the wiggle tracks, it looks like 95% of the data
    #	is in the region 0 to 2.0


##########################################################################
# UW/Regulome QCP data
# To replace existing tracks
# Submitted 5/19/06 by John Stam <jstam@U.WASHINGTON.EDU>
# 1 zip file data: UW_may06_ENCODE_data, plus Description.doc

    cd /cluster/data/encode/Regulome
    mkdir -p 2006-05-19/lab
    cd 2006-05-19/lab
    # deposit data: unpack the submitted zip inside data/ (the subshell
    # leaves us in lab/ afterwards)
    mkdir data
    (cd data; unzip ../*.zip)
    ls data
# CD4.baseline.hg17.bed       HMEC.baseline.hg17.bed   NHBE.baseline.hg17.bed
# CaCo2.baseline.hg17.bed     HRE.baseline.hg17.bed    PANC.baseline.hg17.bed
# CaLU3.baseline.hg17.bed     HeLa.baseline.hg17.bed   SAEC.baseline.hg17.bed
# EryAdult.baseline.hg17.bed  HepG2.baseline.hg17.bed  SKnSH.baseline.hg17.bed
# EryFetal.baseline.hg17.bed  Huh7.baseline.hg17.bed
# GM.baseline.hg17.bed        K562.baseline.hg17.bed

    #	loading bedGraph 5 data type: each file is sorted by chrom and
    #	start, passed through trimOverlap.pl, and loaded as its own table
    CELLS="CD4 CaCo2 CaLU3 EryAdult EryFetal GM HMEC HRE HeLa HepG2 Huh7 K562 NHBE PANC SAEC SKnSH"
    for C in $CELLS
    do
	BED=data/$C.baseline.hg17.bed
	sort -k1,1 -k2,2n $BED \
	    | /cluster/data/encode/bin/scripts/trimOverlap.pl \
	    | hgLoadBed -noSort -noBin -strict -bedGraph=5 hg17 \
		encodeUWRegulomeBase$C stdin
    done

    #	gross statistics for the data
    #	column 5 of all the baseline files, pooled, summarized by ave
    awk '{print $5}' data/*.baseline.hg17.bed | ave stdin
    #	Q1 -0.201029
    #	median 0.000000
    #	Q3 0.207335
    #	average 0.018950
    #	min -5.454980
    #	max 6.327273
    #	count 291642
    #	total 5526.507662
    #	standard deviation 0.489442

    #	a histogram of the data:
    #	NOTE: this glob is data/*, not data/*.baseline.hg17.bed as above;
    #	the two select the same files only while data/ holds nothing else
    awk '{print $5}' data/* | textHistogram -verbose=2 -binSize=0.12 \
	-maxBinCount=100 -minVal=-5.5 -real -pValues stdin \
	    > histogram.data
    #	looking at that, it appears that 95% of the data is within the
    #	range of -1.0 to 1.0
    #	The note that came with this data said to set view limits
    #	at 0.5 : 3.0

    #	Making the trackDb entries, taking colors from:
    #	http://genome-test.cse.ucsc.edu/~hiram/rgbItemExamples.html

    rm -f trackDb.entries.txt
    #	Write one trackDb stanza per cell line to trackDb.entries.txt.
    #	The Nth cell line gets the Nth colour below and priority N.
    #	Colour 10 was recorded as 96,192,326; 326 is not a valid 0-255
    #	RGB component, presumably a typo for 32 -- confirm against the
    #	colour page cited above.
    COLORS=(
	0,0,255   0,48,224  0,96,176  0,119,153
	0,153,119 0,187,85  56,238,0  0,255,0
	68,238,0  96,192,32 136,170,0 170,136,0
	204,102,0 238,68,0  255,0,0   255,0,255
    )
    I=1
    export I
    for CELL in CD4 CaCo2 CaLU3 EryAdult EryFetal GM HMEC HRE HeLa HepG2 \
        Huh7 K562 NHBE PANC SAEC SKnSH
    do
	echo "    track encodeUWRegulomeBase${CELL}"
	echo "    subTrack encodeUWRegulomeBase"
	echo "    shortLabel ${CELL}"
	echo "    longLabel ${CELL} DNaseI Sensitivity"
	#	COLORS is zero-based, I counts from 1
	echo "    color ${COLORS[I-1]}"
	echo "    priority ${I}"
	echo
	I=$((I + 1))
    done > trackDb.entries.txt


##########################################################################
# EBI PECAN Alignments (IN PROGRESS 2006-06-22 kate)
#  From Ben Paten

    cd /cluster/data/encode
    mkdir -p PECAN/SEP-05/lab
    cd PECAN/SEP-05/lab
    wget http://www.ebi.ac.uk/~bjp/pecan/encode_sept_pecan_mafs.tar.bz2
    bunzip2 encode_sept_pecan_mafs.tar.bz2
    tar xvf encode_sept_pecan_mafs.tar
    cd ..

    # project.csh: for every lab/*MAF/EN[mr]* file, look up the region's
    # chrom and chromStart (encodeRegions) and the chrom size (chromInfo),
    # run mafCoord.pl to convert coordinates (writing tmp/<region>.db.maf
    # with "a" lines given score=0.0), then run maf_project on hg17.<chrom>
    # to produce <region>.maf in this directory.

cat > project.csh << 'EOF'
    mkdir -p tmp
    set tmpDir = tmp
    foreach f (lab/*MAF/EN[mr]*)
        set r = $f:t:r
        echo $r
        set c = `echo "SELECT chrom from encodeRegions WHERE name='$r'" | \
                        hgsql -N hg17`
        set start =  \
                `echo "SELECT chromStart from encodeRegions WHERE name='$r'" | \
                        hgsql -N hg17`
        set size = \
                `echo "SELECT size from chromInfo WHERE chrom='$c'" | \
                        hgsql -N hg17`
        /cluster/data/encode/bin/scripts/mafCoord.pl < $f \
                human.0 hg17.$c $start $size | \
            sed 's/^a$/a score=0.0/' > $tmpDir/$r.db.maf
        echo "projecting $r"
        /cluster/bin/penn/maf_project $tmpDir/$r.db.maf hg17.$c > $r.maf
        echo "finished $r"
    end
'EOF'
    # Run in the foreground.  The steps below delete tmp/ and link and
    # load the *.maf files, so they must not start until project.csh has
    # finished (it was previously started with '&' and raced them).
    csh project.csh >&! project.log
    rm -fr tmp
    set gdir = /gbdb/hg17/encode/PECAN/SEP-05/maf
    mkdir -p $gdir
    rm -f $gdir/*.maf
    ln -s /cluster/data/encode/PECAN/SEP-05/*.maf $gdir
    hgLoadMaf -pathPrefix=$gdir -WARN hg17 encodePecanAlign >&! load.log
    # lots of "score too small" messages -- these are OK.
    cat *.maf | hgLoadMafSummary hg17 encodePecanSummary stdin



##########################################################################
# UW/Regulome QCP data again 2006-07-05 - Hiram

    ssh hgwdev
    cd /cluster/data/encode/Regulome/2006-06-13/lab

    #	reload each cell line's table from its normalized bed file
    #	(same sort | trimOverlap.pl | hgLoadBed pipeline as the first load)
    for C in CD4 CaCo2 CaLU3 EryAdult EryFetal GM HMEC HRE HeLa HepG2 \
        Huh7 K562 NHBE PANC SAEC SKnSH
    do
	BED=${C}.normalized_060206.hg17.bed
	ls -og $BED
	sort -k1,1 -k2,2n $BED \
	    | /cluster/data/encode/bin/scripts/trimOverlap.pl \
	    | hgLoadBed -noSort -noBin -strict -bedGraph=5 hg17 \
		encodeUWRegulomeBase${C} stdin
    done

    #	gross statistics for the data
    awk '{print $5}' *.normalized_060206.hg17.bed | ave stdin
    #	Q1 -0.494604
    #	median -0.000000
    #	Q3 0.510167
    #	average 0.046097
    #	min -13.409856
    #	max 15.554195
    #	count 297650
    #	total 13720.736560
    #	standard deviation 1.203400

    #	bin size for a 100 bin histogram spanning -13.5 .. 15.6:
    awk 'BEGIN {print (15.6 - -13.5)/100}'
    #	0.291

    #	a histogram of the data:
    awk '{print $5}' *.normalized_060206.hg17.bed \
	| textHistogram -verbose=2 -binSize=0.292 \
	    -maxBinCount=100 -minVal=-13.5 -real -pValues stdin \
	> histogram.data
    #	looks like the majority of the data is within -1.0 to 1.0

    #	The trackDb entries made previously should be OK

############################################################
# DLESS acs 05/02/06
# sorry this is a bit sketchy.  See me with questions

    cd /cluster/home/acs/encode-dless/hg17

    # make tree model: rename the "human" leaf to "hg17", then write the
    # tree alone to tree.nh
    tree_doctor /cluster/home/acs/DLESS-CSHL/encode17.mod --rename "human->hg17" > tree.mod
    tree_doctor --tree-only tree.mod > tree.nh

    # make SS files, annotated with indels by Brian
    # The here-document delimiter is quoted so that $1, ${TARGET}, $START,
    # etc. are written into the script literally.  With an unquoted EOF
    # the shell expands them while creating the file and the script ends
    # up with empty values.
    cat > prepAlignmentsIndels.sh <<'EOF'
#!/bin/sh

TARGET=$1
CHR=$2
START=$3
END=$4

/cluster/bin/phast/msa_view /cluster/bluearc/encode/TBA/SEP-05/maf-indels/human.${TARGET}.maf -i MAF --refseq /cluster/bluearc/hg17/chrom/${CHR}.fa -o SS --start $START --end $END --refidx 1 > /scratch/${TARGET}.sso

/cluster/bin/phast/msa_view /scratch/$TARGET.sso -i SS -o SS --seqs hg17,chimp,baboon,macaque,marmoset,galago,rat,mouse,rabbit,cow,dog,rfbat,armadillo,elephant,tenrec,monodelphis,platypus --gap-strip ALL | /cluster/home/acs/phast-opteron/bin/msa_view - -i SS -o SS --order  hg17,chimp,baboon,macaque,marmoset,galago,rat,mouse,rabbit,cow,dog,rfbat,armadillo,elephant,tenrec,monodelphis,platypus > /cluster/bluearc/encode/TBA/SEP-05/ss-indels/${TARGET}.sso
# second call adds rows of missing data for missing species

rm /scratch/$TARGET.sso
EOF
    chmod +x prepAlignmentsIndels.sh

    mkdir -p /cluster/bluearc/encode/TBA/SEP-05/maf-indels /cluster/bluearc/encode/TBA/SEP-05/ss-indels
    rsync -avz /cluster/store11/encodeMafAnno/TBA/APR-26/human.*.maf /cluster/bluearc/encode/TBA/SEP-05/maf-indels   # location of Brian's files
    # regions.txt columns: name, chrom, chromStart, chromEnd.  Every awk
    # command that reads this file takes $1 as the target name and $2-$4
    # as chrom/start/end, so the columns are selected explicitly rather
    # than relying on the table's column order ("select *").
    hgsql hg17 -e "select name, chrom, chromStart, chromEnd from encodeRegions" --skip-column-names > regions.txt
    awk '{printf "prepAlignmentsIndels.sh %s %s %s %s\n", $1, $2, $3, $4}' regions.txt > jobList7  # never mind numbering; there were some other experimental runs that I've omitted
    # para create, para push, etc.

    # get indel histories and estimate indel params
    mkdir -p consElements
    # one query file per region (<name>.sql): conserved elements that lie
    # inside the region, with coordinates shifted by the region start
    awk '{printf "select chrom, chromStart - %d + 1, chromEnd - %d + 1 from encodeTbaPhastConsEl where chrom = \"%s\" and chromStart >= %d and chromEnd <= %d\n", $3, $3, $2, $3, $4 > $1 ".sql"}' regions.txt 
    for file in *.sql ; do hgsql hg17 --skip-column-names < $file > consElements/`basename $file .sql`.bed ; done
    rm *.sql

    # Quoted delimiter: $1 and $TARGET must reach the script unexpanded.
    cat > indelHistoryParsBrian.sh <<'EOF'
#!/bin/sh

TARGET=$1
/cluster/bin/phast/indelHistory /cluster/bluearc/encode/TBA/SEP-05/ss-indels/$TARGET.sso tree.nh -i SS > /cluster/bluearc/encode/DLESS/IH-indels/$TARGET.pars.ih
EOF
    chmod +x indelHistoryParsBrian.sh

    rm -f jobList8
    mkdir -p /cluster/bluearc/encode/DLESS/IH-indels
    awk '{print $1}' regions.txt > targets
    for t in `cat targets` ; do echo "indelHistoryParsBrian.sh $t" >> jobList8 ; done
    # para create, para push, etc.

    # Quoted delimiter: $1 and $TARGET must reach the script unexpanded.
    cat > indelModelsBrian.sh <<'EOF'
#!/bin/sh

TARGET=$1
/cluster/bin/phast/indelFit /cluster/bluearc/encode/DLESS/IH-indels/$TARGET.pars.ih tree.nh --features consElements/$TARGET.bed --reference hg17 > IM-indels/$TARGET.pars.im
EOF
    chmod +x indelModelsBrian.sh

    rm -f jobList9
    mkdir -p IM-indels
    for t in `cat targets` ; do echo "indelModelsBrian.sh $t" >> jobList9 ; done
    # para create, para push, etc.

    # average estimates across targets
    # (rows with $2 == 0 are averaged as "bg", rows with $2 == 1 as "co";
    # fields 6, 9 and 12 are taken as alpha, beta and tau)
    sed 's/,//g' IM-indels/*.pars.im | awk '{if ($2 == 0) {nbg++; a_bg += $6; b_bg += $9; t_bg += $12} else if ($2 == 1) {nco++; a_co += $6; b_co += $9; t_co += $12}} END {printf "bg: alpha = %f, beta = %f, tau = %f\nco: alpha = %f, beta = %f, tau = %f\n", a_bg/nbg, b_bg/nbg, t_bg/nbg, a_co/nco, b_co/nco, t_co/nco}' > ave.pars.brian.im
    #bg: alpha = 0.033417, beta = 0.053284, tau = 0.052852
    #co: alpha = 0.011655, beta = 0.020610, tau = 0.065395

    # now estimate DLESS params by ML
    # Quoted delimiter: $1, $2, $TARGET and $CHR must reach the script
    # unexpanded.
    cat > doDlessEstimateParsBrian.sh <<'EOF'
#!/bin/sh

TARGET=$1
CHR=$2
/cluster/bin/phast/dless /cluster/bluearc/encode/TBA/SEP-05/ss-indels/$TARGET.sso tree.mod --expected-length 20 --target-coverage ~0.2 -i SS --seqname $CHR --idpref $TARGET --indel-model 0.033417,0.053284,0.052852,0.011655,0.020610,0.065395 --phi ~0.5 --indel-history /cluster/bluearc/encode/DLESS/IH-indels/$TARGET.pars.ih 1> /cluster/bluearc/encode/DLESS/ESTIMATE-indels/$TARGET.pars.gff 2> /cluster/bluearc/encode/DLESS/ESTIMATE-indels/$TARGET.pars.stderr
EOF
    chmod +x doDlessEstimateParsBrian.sh

    mkdir -p /cluster/bluearc/encode/DLESS/ESTIMATE-indels
    awk '{printf "doDlessEstimateParsBrian.sh %s %s\n", $1, $2}' regions.txt > jobList10
    # para create, para push, etc.

    # average estimates across targets
    # (only stderr files containing a line starting with "Done" are used;
    # fields 2 and 3 of the 9th line from the end are averaged)
    rm -f estimates.pars.brian.txt 
    grep '^Done' -l /cluster/bluearc/encode/DLESS/ESTIMATE-indels/*.pars.stderr > tmp1
    for file in `cat tmp1` ; do tail -9 $file | head -1 | awk '{print $2, $3}'>> estimates.pars.brian.txt ; done
    awk '{x += $1; y += $2} END {print "Pars:", x/NR, y/NR}' estimates.pars.brian.txt
    rm tmp1
    # Pars: 0.0551889 0.261488

    # predict elements
    # Quoted delimiter: $1, $2, $TARGET and $CHR must reach the script
    # unexpanded.  The fixed --target-coverage and --phi values are the
    # averaged estimates from the previous step, rounded.
    cat > doDlessPars.sh <<'EOF'
#!/usr/local/bin/bash -e

TARGET=$1
CHR=$2
/cluster/bin/phast/dless /cluster/bluearc/encode/TBA/SEP-05/ss-indels/$TARGET.sso tree.mod --expected-length 20 --target-coverage 0.055 --phi 0.261 -i SS --seqname $CHR --idpref $TARGET --indel-model 0.033417,0.053284,0.052852,0.011655,0.020610,0.065395 --indel-history /cluster/bluearc/encode/DLESS/IH-indels/$TARGET.pars.ih 1> /cluster/bluearc/encode/DLESS/GFF/$TARGET.pars.gff 2> /cluster/bluearc/encode/DLESS/STDERR/$TARGET.pars.stderr
EOF
    chmod +x doDlessPars.sh

    mkdir -p /cluster/bluearc/encode/DLESS/GFF /cluster/bluearc/encode/DLESS/STDERR
    awk '{printf "doDlessPars.sh %s %s\n", $1, $2}' regions.txt > jobList4
    # para create, para push, etc.

    # compute P-values with phyloP
    # doGeneric.sh <program> <outfile> <args...>: echoes the command line,
    # then runs <program> <args...> with stdout redirected to <outfile>.
    # Quoted delimiter: $1, $2 and ${*:3} must reach the script
    # unexpanded.
    cat > doGeneric.sh <<'EOF'
#!/usr/local/bin/bash

echo $1 ${*:3} '>' $2
$1 ${*:3} > $2
EOF
    chmod +x doGeneric.sh

    mkdir -p /cluster/bluearc/encode/DLESS/DLESSP
    rm -f jobList5
    for t in `cat targets` ; do \
            echo "./doGeneric.sh /cluster/bin/phast/dlessP /cluster/bluearc/encode/DLESS/DLESSP/$t.pars.dlessP /cluster/bluearc/encode/TBA/SEP-05/ss-indels/$t.sso -i SS /cluster/home/acs/encode-dless/hg17/tree.mod /cluster/bluearc/encode/DLESS/GFF/$t.pars.gff" >> jobList5 ;\
    done
    # para create, para push, etc.

    # load track
    hgsql hg17 -e "drop table if exists encodeDless"
    # pool the dlessP output: drop '#' lines, sort by columns 1 and 2,
    # keep only 24-field rows, and replace the first hg17 per line by human
    grep -h -v '^#' /cluster/bluearc/encode/DLESS/DLESSP/*.pars.dlessP \
	| sort -k1,1 -k2,2n \
	| awk 'NF == 24' \
	| sed 's/hg17/human/' > dless.dat
    # keep a row only if its type ($6) passes the 0.05 thresholds for
    # that type on columns 8-11
    awk '
	$6 == "conserved" && $8 < 0.05
	$6 == "gain" && $8 < 0.05 && $9 > 0.05 && $10 < 0.05
	$6 == "loss" && $8 > 0.05 && $9 < 0.05 && $11 < 0.05
    ' dless.dat > dless.filtered.dat
    # create the table from the dless schema under the ENCODE name, then load
    sed 's/dless/encodeDless/g' ~/kent/src/hg/lib/dless.sql | hgsql hg17
    hgsql hg17 -e "load data local infile 'dless.filtered.dat' into table encodeDless"

#######################################################################
# YALE RFBR (Regulatory Factor Binding Regions) DATA 
# (DONE, 2006-11-01-2006-11-13, hartera)
# Data provided by Mark Gerstein's lab at Yale.
# Contact: zhengdong.zhang@yale.edu 
# E-mail from Zhengdong on Nov. 20, 2006 to say that currently there is a 
# discussion among some ENCODE people if DHS sites should be included
# when the clusters and deserts are generated. Therefore, it should not be 
# released to the public site yet.   
# Approved for release to public site by Zhengdong Zhang on 2007-01-24.
    ssh hgwdev
    mkdir /cluster/data/encode/yale/rfbr
    cd /cluster/data/encode/yale/rfbr
    # move data from ftp site
    mv /var/ftp/encode/encode-tf-clusters-deserts.zip .
    unzip encode-tf-clusters-deserts.zip 
    # 3 files:
# encode-tf-clusters-deserts-description.txt
# encode-tf-clusters.bed
# encode-tf-deserts.bed
   # The name field holds a lookup URL; strip the URL prefix so that only
   # the accession is left (first occurrence on each line).
   set stripUrl = 's/http\:\/\/dart\.gersteinlab\.org\/cgi\-bin\/ar\/lookup\.cgi?acc=//'
   foreach kind (clusters deserts)
      sed -e "$stripUrl" encode-tf-$kind.bed > $kind.bed
   end

   # load data files as BED files
   hgLoadBed hg17 encodeYaleChipRfbrClusters clusters.bed
   hgLoadBed hg17 encodeYaleChipRfbrDeserts deserts.bed

   # the submitted description text becomes the track description page
   cp encode-tf-clusters-deserts-description.txt \
      ~/kent/src/hg/makeDb/trackDb/human/hg17/encodeYaleChipRfbr.html
   # edit description to add extra method details requested from contributor
# Add trackDb entry to trackDb.encode.ra for hg17. see encodeYaleAffyRNATars
# and use the same url and urlLabel and dataVersion is Dec 2005 here. 
# that is the contributors data freeze, not an ENCODE one.


################################################################
# EvoFold ENCODE track, Jakob Skou Pedersen 12.03.2006 

# This is an update of the existing TBA23 EvoFold track (table:
# encode_tba23EvoFold), but a new track is made because of naming,
# etc.

# These are one time predictions accompanying the encode paper, so we
# start by fetching the data from the web and modify the element
# scores and names.

ssh hgwdev
cd  /cluster/data/encode/Evofold
wget http://www.soe.ucsc.edu/~jsp/encFolds/bed/nativeTop100perc.bed
# score (column 5) becomes int(100*score); name (column 4) becomes
# name_strand_score using the new score; output is tab separated
awk -v OFS='\t' '{ $5 = int(100 * $5); $4 = $4 "_" $6 "_" $5; print }' \
    nativeTop100perc.bed > encodeEvoFold.bed

# encodeEvoFold.bed is a 9-column bed file: columns 1-6 contain standard
# information, column 7 is element length, column 8 is the RNA
# secondary structure in parentheses format, and column 9 is a
# comma-separated list of position specific confidence scores (floats).

# table definition: evofold.sql with the first "evofold" on each line renamed
sed -e 's/evofold/encodeEvoFold/' \
    /cluster/home/jsp/prog/kent/src/hg/lib/evofold.sql > tmp.sql
hgLoadBed -notItemRgb -sqlTable=tmp.sql hg17 encodeEvoFold encodeEvoFold.bed

###########################################################################
# UVienna RNA structure data (DONE, 2007-01-03 - 2007-01-04, hartera)
# Submitted 2006-12-15 by Stefan Washietl at the University of Vienna
# 
ssh hgwdev
mkdir -p /cluster/data/encode/UVienna/2006-12-15/lab
cd /cluster/data/encode/UVienna/
ln -s /cluster/data/encode/UVienna/2006-12-15 \
      /cluster/data/encode/UVienna/latest

# copy rnaz.bed and rnaz_track.html to /cluster/data/encode/UVienna/latest/lab
cd /cluster/data/encode/UVienna/latest/lab
# remove header line
# ("tail -n +2" is the portable spelling; current GNU tail takes the old
# "tail +2" form as a file named "+2")
tail -n +2 rnaz.bed > rnazNoHeader.bed
# load the data in rnaz.bed into the hg17 database
hgLoadBed -notItemRgb -strict hg17 encodeUViennaRnaz rnazNoHeader.bed
# Reading rnazNoHeader.bed
# Loaded 3707 elements of size 4
# Sorted

# then add trackDb.ra entry in trackDb/human/hg17/trackDb.encode.ra
# and add the description.
cp rnaz_track.html \
   ~/kent/src/hg/makeDb/trackDb/human/hg17/encodeUViennaRnaz.html
# edit description and change re-format publications


############################################################################
# Gencode Loci RACEfrags - 5' RACE-ARRAY experiments on Gencode loci
# Submitted on 2007-04-11 by France Denoeud: fdenoeud@genoscope.cns.fr
# of the Gencode group.
# Create a directory for the data and copy the data there. Data was 
# sent by e-mail. (DONE, hartera, 2007-04-25)

mkdir -p /cluster/data/encode/GencodeRACEfrags/2007-04-11/lab
# Data is in file: RACEFRAGS_UCSC.gff
# Also add the e-mails from France as a README.txt - these explain how the 
# track should be displayed. It was decided that the BAC end pairs display
# should be used where the arrows between blocks have no lines through them.
# README.txt is in 2007-04-11 directory.
cd /cluster/data/encode/GencodeRACEfrags/
ln -s 2007-04-11 latest
cd 2007-04-11
# 16 subtracks:
# 5' RACE Primers and 15 other cell/tissue types called RACEfrags from $2
# where $2 is cell/tissue in column 2.
# Sometimes there are overlapping "exons".
# List the distinct column 2 values and keep the list in subtracks.txt
# for the per-subtrack loop (which maps 5RACE_primer to just Primer).
awk '{print $2;}' lab/RACEFRAGS_UCSC.gff | sort -u | tee subtracks.txt
# list of primer and cell/tissue types:
# 5RACE_primer
# Brain
# Colon
# GM06990
# HL60
# HeLa
# Heart
# Kidney
# Liver
# Lung
# Muscle
# Placenta
# Small-Intest
# Spleen
# Stomach
# Testis

# 5' RACE was performed on about 400 ENCODE genes and the RACE products were
# pooled together (in 5 pools of about 80 genes each) for hybridization on the
# tiling arrays. There were a few pooling errors, therefore a gene
# product from tissue A is sometimes in pool1 while the product from the same
# gene in tissue B is in pool 2 or there are RACEfrags for the same gene in
# the same tissue that came from different pools (France Denoeud, 2007-04-17)
# create GFF files for loading, change 3rd column to CDS so ldHgGene loads
# tables correctly.
# For each subtrack name s, write the matching GFF lines to
# encodeGencodeRaceFrags<t>.gff, where t is s adjusted for use in a table
# name, then rewrite the feature column to CDS in place (perl keeps a .bak).
foreach s (`cat subtracks.txt`)
   echo $s
   set t = $s
   if ("$s" == "5RACE_primer") then
      set t = "Primer"
   else if ("$s" == "HeLa") then
      set t = "Hela"
   else if ("$s" == "Small-Intest") then 
      set t = "SmallIntest"
   endif
   grep $s lab/RACEFRAGS_UCSC.gff > encodeGencodeRaceFrags${t}.gff
   # This test must use $t: a bare t compares the literal string "t" and
   # is never equal to "Primer", so the primer file was never converted.
   if ("$t" == "Primer") then
     perl -pi.bak -e 's/(\s+)primer/${1}CDS/' encodeGencodeRaceFrags${t}.gff
   else
     perl -pi.bak -e 's/racefrag_pool[0-9]+/CDS/' encodeGencodeRaceFrags${t}.gff  
   endif
end

# Load the gff files for the subtracks into the database:
# load.csh runs ldHgGene on every *.gff in the current directory, loading
# each into an hg17 table named after the file without its extension ($f:r).
cat << 'EOF' > load.csh
foreach f (*.gff)
    set table = $f:r
    echo $table
    ldHgGene hg17 $table $f
end
'EOF'

chmod +x load.csh
csh load.csh >&! load.log
# clean up: the .bak files left by "perl -pi.bak" and the load log
rm *.bak load.log

# Loading program will merge overlapping RACEfrags to show one "exon".
# Added trackDb track and hgFindSpec search entries to trackDb.ra.
# Reordered tracks in trackDb/trackDb.encode.ra and edited description 
# (2007-04-25, hartera).

# The encodeGencodeRaceFragsPrimer.gff contains a "." instead of a strand
# so add + for the strand. (2007-05-10, hartera)

hgsql -e 'update encodeGencodeRaceFragsPrimer set strand = "+";' hg17

# Added trackDb.encode.ra setting:
# autoTranslate 0
# to remove protein translation on details page as it does not make sense
# to have it for this track. Also added code to hgTrackUi.c so that the 
# genePred track configurations controls for selecting the item label and 
# codon coloring are not drawn as they are not applicable either for this 
# track.

# Download custom files from France made available May 15, 2007.
# (hartera, 2007-05-22)
cd /cluster/data/encode/GencodeRACEfrags/2007-04-11
mkdir custom
cd custom
wget --timestamping \
http://genome.imim.es/~jlagarde/tmp/racefrags_customfiles.tgz
# gunzip turns the .tgz into racefrags_customfiles.tar
gunzip racefrags_customfiles.tgz 
tar -xvf racefrags_customfiles.tar

# On 2007-05-29, France sent the table that contains the links to the custom
# track files above. We need to host this on our server too. A link on the
# track description page should lead to the table. 
# Download table (Sequences_Description.html) from e-mail to
# /cluster/data/encode/GencodeRACEfrags/2007-04-11/custom/

# make directory for the custom tracks in the ENCODE datafiles directory
cd /usr/local/apache/htdocs/goldenPath/hg17/encode/datafiles/
mkdir -p GencodeRACEfrags
cd GencodeRACEfrags
ln -s /cluster/data/encode/GencodeRACEfrags/2007-04-11/custom 2007-04-11

cd /cluster/data/encode/GencodeRACEfrags/2007-04-11/custom
# change the links in the table in Sequences_Description.html to
# http://hgdownload.cse.ucsc.edu/goldenPath/hg17/encode/datafiles/GencodeRACEfrags/2007-04-11/custom_file_*
# (the backslash after -e is required: without it sed runs with no
# expression and the next line is executed as a separate command)
sed -e \
's/genome\.imim\.es\/%7Efdenoeud/hgdownload\.cse\.ucsc\.edu\/goldenPath\/hg17\/encode\/datafiles\/GencodeRACEfrags\/2007\-04\-11/g' Sequences_Description.html \
> raceFragSequencesTable.html
# Change link in the encodeGencodeRaceFrags.html description page to point
# to the table on the hgdownloads server.
# Table and custom tracks pushed to hgdownloads on 2007-06-01.

###########################################################################
# Affy EC chrom21/chrom22 RELOAD (Andy 2008-03-20)

ssh hgwdev
bash
cd /cluster/data/encode/Affy
mkdir -p 2008-03-20/{lab,processed}
cd 2008-03-20/lab/
cp /var/ftp/encode/encode_ext_RNA_hg17_chr21-22.tar .
tar xf encode_ext_RNA_hg17_chr21-22.tar
find . -name '*bz2' -exec bunzip2 '{}' \;
# All paths below (lab/..., processed/...) are relative to 2008-03-20,
# so step back out of lab/ first.
cd ..
# wigTable/ and wib/ are written to by the wiggle loop further down
mkdir -p processed/hg17/{download,bed,wigTable,wib}
# Each entry is <lab subdirectory>:<table infix>: BW0 data become
# encodeAffyEc1* tables, BW25 data become encodeAffyEc51* tables.
for pair in BW0:1 BW25:51; do
   bw=${pair%%:*}
   ec=${pair##*:}
   for f in lab/$bw/bed/*; do
      tiss=$(echo $f | sed 's/.*\///;s/\.bed//')
      newF=processed/hg17/bed/encodeAffyEc${ec}${tiss}Sites.bed
      # drop the first line (presumably a track/header line)
      tail -n +2 $f > $newF
   done
done
cd processed/hg17/bed/
for bed in *.bed; do hgLoadBed hg17 ${bed%.bed} $bed; done
# three levels up: bed -> hg17 -> processed -> 2008-03-20
cd ../../..
# beds loaded, now for the wiggles...
topDir=/cluster/data/encode/Affy/2008-03-20
for pair in BW0:1 BW25:51; do
   bw=${pair%%:*}
   ec=${pair##*:}
   for f in lab/$bw/wig/*; do
      tiss=$(echo $f | sed 's/.*\///;s/\.sig.wig//')
      table=encodeAffyEc${ec}${tiss}Signal
      downDir=processed/hg17/download
      wig=processed/hg17/wigTable/${table}.tab
      # wigEncode is given a bare .wib name and the file is moved into
      # place afterwards, as in the later reload of these tracks and the
      # other wiggle loads in this doc (bare name plus -pathPrefix).
      wib=${table}.wib
      zip=${downDir}/${table}.wigVar.gz
      tail -n +2 $f | gzip -c > $zip
      wigEncode $zip $wig $wib 2>> processed/hg17/wigEncode.log
      mv $wib processed/hg17/wib/
      # explicit absolute target; avoids depending on the directory
      # stack (dirs -1) for the link target
      ln -s $topDir/processed/hg17/wib/$wib /gbdb/hg17/encode/wib/
      hgLoadWiggle -pathPrefix=/gbdb/hg17/encode/wib hg17 $table $wig
   done
done
mkdir -p /data/apache/htdocs/goldenPath/hg17/encode/downloads
cd /data/apache/htdocs/goldenPath/hg17/encode/downloads
ln -s /cluster/data/encode/Affy/2008-03-20/processed/hg17/download/*.gz .
# found out these have bad headers.  I'm going to reduce the span
# from 25 to 1.  
cd /cluster/data/encode/Affy/2008-03-20
# make sure the output directories exist; without wib/ the mv below
# would rename the file to "wib" instead of moving it into a directory
mkdir -p processed/hg17/{download,wigTable,wib}
# Each entry is <lab subdirectory>:<table infix> (BW0 -> encodeAffyEc1*,
# BW25 -> encodeAffyEc51*).  Same as the first load except that span=25
# is rewritten to span=1 on the way into the download file.
for pair in BW0:1 BW25:51; do
   bw=${pair%%:*}
   ec=${pair##*:}
   for f in lab/$bw/wig/*; do
      tiss=$(echo $f | sed 's/.*\///;s/\.sig.wig//')
      table=encodeAffyEc${ec}${tiss}Signal
      downDir=processed/hg17/download
      wig=processed/hg17/wigTable/${table}.tab
      wib=${table}.wib
      zip=${downDir}/${table}.wigVar.gz
      # "tail -n +2" (skip first line) is the portable form of "tail +2"
      tail -n +2 $f | sed 's/span=25/span=1/' | gzip -c > $zip
      wigEncode $zip $wig $wib 2>> processed/hg17/wigEncode.log
      mv $wib processed/hg17/wib/
      hgLoadWiggle -pathPrefix=/gbdb/hg17/encode/wib hg17 $table $wig
   done
done

# Renames (2008-04-14 Andy)

# first GM06690 -> GM06990
ssh hgwdev
cd /cluster/data/encode/Affy/2008-03-20/processed
# Explicit roots are used for every link and delete below, so the working
# directory stays in processed/ for the relative paths in the load loops
# (the pushd into the web directory was previously never popped).
procDir=/cluster/data/encode/Affy/2008-03-20/processed
webRoot=/usr/local/apache/htdocs/goldenPath
# rename the files; only names that actually contain the wrong id are
# touched, so mv is never asked to move a file onto itself
for f in $(find . -name '*GM06690*'); do
   newF=$(echo $f | sed 's/6690/6990/')
   mv $f $newF
done
# fix the id inside the renamed wiggle .tab files as well
for f in $(find . -name '*GM06990*.tab'); do sed 's/GM06690/GM06990/' $f > tmp; mv tmp $f; done
# replace the /gbdb wib links
rm /gbdb/hg1{7,8}/encode/wib/encodeAffyEc{5,}1GM06690Signal.wib
for db in hg1{7,8}; do
   ln -s $procDir/$db/wib/encodeAffyEc{5,}1GM06990Signal.wib /gbdb/$db/encode/wib/
done
# replace the download links (hg17 files are under download/, hg18 under wigVar/)
rm $webRoot/hg17/encode/wig/encodeAffyEc*GM0*
ln -s $procDir/hg17/download/encodeAffyEc{5,}1GM06990Signal.wigVar.gz $webRoot/hg17/encode/wig/
rm $webRoot/hg18/encode/wig/encodeAffyEc*GM0*
ln -s $procDir/hg18/wigVar/encodeAffyEc{5,}1GM06990Signal.wigVar.gz $webRoot/hg18/encode/wig/
# load the wiggle tables under the corrected name
for db in hg1{7,8}; do
   cd $db/wigTable
   for table in encodeAffyEc{5,}1GM06990Signal; do
       hgLoadWiggle -pathPrefix=/gbdb/${db}/encode/wib $db $table ${table}.tab
   done
   cd ../../
done
# drop the tables with the old name (-N suppresses the column header,
# replacing the obsolete "tail +2")
for db in hg1{7,8}; do 
   for table in $(hgsql -N -e "show tables like 'encodeAffyEc%GM066%'" $db); do
      echo drop table $table | hgsql $db;
   done
done
# load the bed tables under the corrected name
for db in hg1{7,8}; do 
   for table in encodeAffyEc{5,}1GM06990Sites; do
      hgLoadBed $db $table ${db}/bed/${table}.bed
   done
done

# now Testes -> Testis
# Delete
# Runs from /cluster/data/encode/Affy/2008-03-20/processed (the relative
# hg17/... and hg18/... paths and the find commands depend on it).
rm /gbdb/hg1{7,8}/encode/wib/encodeAffyEc*Testes*.wib
rm /usr/local/apache/htdocs/goldenPath/hg1{7,8}/encode/wig/encodeAffyEc*Testes*.wigVar.gz
# drop the tables with the old name (-N suppresses the column header,
# replacing the obsolete "tail +2")
for db in hg1{7,8}; do
   for table in $(hgsql -N -e "show tables like 'encodeAffyEc%Testes%'" $db); do
     echo drop table $table | hgsql $db; 
   done; 
done

# Change
for f in $(find . -name '*Testes*'); do newF=$(echo $f | sed 's/Testes/Testis/'); mv $f $newF; done
for f in $(find . -name '*Testis*.tab'); do sed 's/Testes/Testis/' $f > tmp; mv tmp $f; done

# add links (explicit destinations, so the working directory never changes)
procDir=/cluster/data/encode/Affy/2008-03-20/processed
webRoot=/usr/local/apache/htdocs/goldenPath
for db in hg1{7,8}; do
    ln -s $procDir/$db/wib/encodeAffyEc*Testis*.wib /gbdb/$db/encode/wib/
done
# hg17 download files are under download/, hg18 under wigVar/
ln -s $procDir/hg17/download/encodeAffyEc*Testis*.wigVar.gz $webRoot/hg17/encode/wig/
ln -s $procDir/hg18/wigVar/encodeAffyEc*Testis*.wigVar.gz $webRoot/hg18/encode/wig/

# load beds
for db in hg1{7,8}; do
    pushd ${db}/bed
    for bed in *Testis*; do 
        hgLoadBed $db ${bed%.bed} $bed
    done
    popd
done

# load wiggles
for db in hg1{7,8}; do
    pushd ${db}/wigTable
    for tab in *Testis*; do 
        hgLoadWiggle -pathPrefix=/gbdb/${db}/encode/wib $db ${tab%.tab} $tab
    done
    popd
done

