
2011-05-25: import of GENCODE V7 (markd)
    # The UCSC Genome Browser uses the NC_001807 mitochondrial genome sequence
    # (chrM), while GENCODE annotates the NC_012920 mitochondrial sequence.
    # Because these are different sequences, the GENCODE mitochondrial
    # annotations are not loaded (see --excludeChrom=chrM in the build log).

    # download files
    mkdir -p /hive/groups/encode/dcc/data/gencodeV7/release
    cd /hive/groups/encode/dcc/data/gencodeV7/release

    wget ftp://ftp.sanger.ac.uk/pub/gencode/release_7/gencode.v7.2wayconspseudos.gtf.gz
    wget ftp://ftp.sanger.ac.uk/pub/gencode/release_7/gencode.v7.annotation.gtf.gz
    wget ftp://ftp.sanger.ac.uk/pub/gencode/release_7/gencode.v7.noncoding_RNAs.fa.gz
    wget ftp://ftp.sanger.ac.uk/pub/gencode/release_7/gencode.v7.noncoding_RNAs.gtf.gz
    wget ftp://ftp.sanger.ac.uk/pub/gencode/release_7/gencode.v7.pc_transcripts.fa.gz
    wget ftp://ftp.sanger.ac.uk/pub/gencode/release_7/gencode.v7.pc_translations.fa.gz
    wget ftp://ftp.sanger.ac.uk/pub/gencode/release_7/gencode.v7.polyAs.gtf.gz
    wget ftp://ftp.sanger.ac.uk/pub/gencode/release_7/gencode.v7.tRNAs.gtf.gz
    wget ftp://ftp.sanger.ac.uk/pub/gencode/release_7/gencode7_GRCh37.tgz

    # sanity check: make sure every downloaded file decompresses without error
    for f in * ; do zcat $f >/dev/null ; done

    # untar main distribution
    tar -zxf gencode7_GRCh37.tgz
    
    # Created a Makefile to build and load all tables.  It depends on
    # code in the CCDS subversion tree:
    #   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/svnroot/hausslerlab/ccds/trunk
    # and on markd's python library (which will be moved to the hausslerlab
    # repository soon).

    cd /hive/groups/encode/dcc/data/gencodeV7/
    (time nice make) >&build.out&
    # the build took 48 minutes; the make output log follows

------------------------------------------------------------------------------
cat release/gencode_release_7/gencode.v7.annotation.level_1_2.gtf release/gencode_release_7/gencode.v7.annotation.level_3.gtf | gencodeGtfToGenePred /dev/stdin data/gencodeManAuto.gp.hgwdev.5943.tmp
mv -f data/gencodeManAuto.gp.hgwdev.5943.tmp data/gencodeManAuto.gp
cat release/gencode_release_7/gencode.v7.annotation.level_1_2.gtf release/gencode_release_7/gencode.v7.annotation.level_3.gtf | gencodeGtfToAttrs /dev/stdin data/gencodeManAuto.tsv.hgwdev.5943.tmp
mv -f data/gencodeManAuto.tsv.hgwdev.5943.tmp data/gencodeManAuto.tsv
gencodeMakeTracks --excludeChrom=chrM $(echo Basic | tr A-Z a-z) data/gencodeManAuto.gp data/gencodeManAuto.tsv tables/wgEncodeGencodeBasicV7.gp.hgwdev.5943.tmp
mv -f tables/wgEncodeGencodeBasicV7.gp.hgwdev.5943.tmp tables/wgEncodeGencodeBasicV7.gp
gencodeMakeTracks --excludeChrom=chrM $(echo Comp | tr A-Z a-z) data/gencodeManAuto.gp data/gencodeManAuto.tsv tables/wgEncodeGencodeCompV7.gp.hgwdev.5943.tmp
mv -f tables/wgEncodeGencodeCompV7.gp.hgwdev.5943.tmp tables/wgEncodeGencodeCompV7.gp
gencodeMakeTracks --excludeChrom=chrM $(echo PseudoGene | tr A-Z a-z) data/gencodeManAuto.gp data/gencodeManAuto.tsv tables/wgEncodeGencodePseudoGeneV7.gp.hgwdev.5943.tmp
mv -f tables/wgEncodeGencodePseudoGeneV7.gp.hgwdev.5943.tmp tables/wgEncodeGencodePseudoGeneV7.gp
gencodePolyaGtfToGenePred release/gencode_release_7/gencode.v7.polyAs.gtf tables/wgEncodeGencodePolyaV7.gp.hgwdev.5943.tmp
mv -f tables/wgEncodeGencodePolyaV7.gp.hgwdev.5943.tmp tables/wgEncodeGencodePolyaV7.gp
tawk '$3=="transcript"{$3 = "exon"} {print $0}' release/gencode_release_7/gencode.v7.2wayconspseudos.GRCh37.gtf | gtfToGenePred stdin tables/wgEncodeGencode2wayConsPseudoV7.gp.hgwdev.5943.tmp
mv -f tables/wgEncodeGencode2wayConsPseudoV7.gp.hgwdev.5943.tmp tables/wgEncodeGencode2wayConsPseudoV7.gp
gencodeMakeAttrs --excludeChrom=chrM data/gencodeManAuto.gp data/gencodeManAuto.tsv tables/wgEncodeGencodeAttrsV7.tab.hgwdev.5943.tmp tables/wgEncodeGencodeTagV7.tab
mv -f tables/wgEncodeGencodeAttrsV7.tab.hgwdev.5943.tmp tables/wgEncodeGencodeAttrsV7.tab
mkdir -p tables/
cp release/gencode_release_7/metadata/gencode.v7.metadata.Gene_source tables/wgEncodeGencodeGeneSourceV7.tab.hgwdev.5943.tmp
mv -f tables/wgEncodeGencodeGeneSourceV7.tab.hgwdev.5943.tmp tables/wgEncodeGencodeGeneSourceV7.tab
mkdir -p tables/
cp release/gencode_release_7/metadata/gencode.v7.metadata.Transcript_source tables/wgEncodeGencodeTranscriptSourceV7.tab.hgwdev.5943.tmp
mv -f tables/wgEncodeGencodeTranscriptSourceV7.tab.hgwdev.5943.tmp tables/wgEncodeGencodeTranscriptSourceV7.tab
mkdir -p tables/
cp release/gencode_release_7/metadata/gencode.v7.metadata.Transcript_supporting_feature tables/wgEncodeGencodeTranscriptSupportV7.tab.hgwdev.5943.tmp
mv -f tables/wgEncodeGencodeTranscriptSupportV7.tab.hgwdev.5943.tmp tables/wgEncodeGencodeTranscriptSupportV7.tab
tawk '{split($5,coord,":|-"); print $1,$2,$3,$4,coord[1],coord[2]-1,coord[3]}' release/gencode_release_7/metadata/gencode.v7.metadata.Exon_supporting_feature | sort -k 1,1 -k 2,2 -k 5,5 -k 6,6n > tables/wgEncodeGencodeExonSupportV7.tab.hgwdev.5943.tmp
mv -f tables/wgEncodeGencodeExonSupportV7.tab.hgwdev.5943.tmp tables/wgEncodeGencodeExonSupportV7.tab
mkdir -p tables/
cp release/gencode_release_7/metadata/gencode.v7.metadata.PDB tables/wgEncodeGencodePdbV7.tab.hgwdev.5943.tmp
mv -f tables/wgEncodeGencodePdbV7.tab.hgwdev.5943.tmp tables/wgEncodeGencodePdbV7.tab
mkdir -p tables/
cp release/gencode_release_7/metadata/gencode.v7.metadata.Pubmed_id tables/wgEncodeGencodePubMedV7.tab.hgwdev.5943.tmp
mv -f tables/wgEncodeGencodePubMedV7.tab.hgwdev.5943.tmp tables/wgEncodeGencodePubMedV7.tab
mkdir -p tables/
cp release/gencode_release_7/metadata/gencode.v7.metadata.RefSeq tables/wgEncodeGencodeRefSeqV7.tab.hgwdev.5943.tmp
mv -f tables/wgEncodeGencodeRefSeqV7.tab.hgwdev.5943.tmp tables/wgEncodeGencodeRefSeqV7.tab
(tawk '{print $0,"SwissProt"}' release/gencode_release_7/metadata/gencode.v7.metadata.SwissProt && tawk '{print $0,"TrEMBL"}' release/gencode_release_7/metadata/gencode.v7.metadata.TrEMBL) | sort -k 1,1 > tables/wgEncodeGencodeUniProtV7.tab.hgwdev.5943.tmp
mv -f tables/wgEncodeGencodeUniProtV7.tab.hgwdev.5943.tmp tables/wgEncodeGencodeUniProtV7.tab
hgLoadGenePred -genePredExt hg19 wgEncodeGencodeBasicV7 tables/wgEncodeGencodeBasicV7.gp
touch loaded/wgEncodeGencodeBasicV7.genePredExt.loaded
hgLoadGenePred -genePredExt hg19 wgEncodeGencodeCompV7 tables/wgEncodeGencodeCompV7.gp
touch loaded/wgEncodeGencodeCompV7.genePredExt.loaded
hgLoadGenePred -genePredExt hg19 wgEncodeGencodePseudoGeneV7 tables/wgEncodeGencodePseudoGeneV7.gp
touch loaded/wgEncodeGencodePseudoGeneV7.genePredExt.loaded
hgLoadGenePred -genePredExt hg19 wgEncodeGencodePolyaV7 tables/wgEncodeGencodePolyaV7.gp
touch loaded/wgEncodeGencodePolyaV7.genePredExt.loaded
hgLoadGenePred hg19 wgEncodeGencode2wayConsPseudoV7 tables/wgEncodeGencode2wayConsPseudoV7.gp
touch loaded/wgEncodeGencode2wayConsPseudoV7.genePred.loaded
mkdir -p loaded/
hgLoadSqlTab hg19 wgEncodeGencodeAttrsV7 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeAttrs.sql tables/wgEncodeGencodeAttrsV7.tab 
Scanning through 1 files
touch loaded/wgEncodeGencodeAttrsV7.tab.loaded
mkdir -p loaded/
hgLoadSqlTab hg19 wgEncodeGencodeTagV7 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeTag.sql tables/wgEncodeGencodeTagV7.tab 
Scanning through 1 files
touch loaded/wgEncodeGencodeTagV7.tab.loaded
mkdir -p loaded/
hgLoadSqlTab hg19 wgEncodeGencodeGeneSourceV7 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeGeneSource.sql tables/wgEncodeGencodeGeneSourceV7.tab 
Scanning through 1 files
touch loaded/wgEncodeGencodeGeneSourceV7.tab.loaded
mkdir -p loaded/
hgLoadSqlTab hg19 wgEncodeGencodeTranscriptSourceV7 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeTranscriptSource.sql tables/wgEncodeGencodeTranscriptSourceV7.tab 
Scanning through 1 files
touch loaded/wgEncodeGencodeTranscriptSourceV7.tab.loaded
mkdir -p loaded/
hgLoadSqlTab hg19 wgEncodeGencodeTranscriptSupportV7 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeTranscriptSupport.sql tables/wgEncodeGencodeTranscriptSupportV7.tab 
Scanning through 1 files
touch loaded/wgEncodeGencodeTranscriptSupportV7.tab.loaded
mkdir -p loaded/
hgLoadSqlTab hg19 wgEncodeGencodeExonSupportV7 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeExonSupport.sql tables/wgEncodeGencodeExonSupportV7.tab 
Scanning through 1 files
touch loaded/wgEncodeGencodeExonSupportV7.tab.loaded
mkdir -p loaded/
hgLoadSqlTab hg19 wgEncodeGencodePdbV7 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodePdb.sql tables/wgEncodeGencodePdbV7.tab 
Scanning through 1 files
touch loaded/wgEncodeGencodePdbV7.tab.loaded
mkdir -p loaded/
hgLoadSqlTab hg19 wgEncodeGencodePubMedV7 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodePubMed.sql tables/wgEncodeGencodePubMedV7.tab 
Scanning through 1 files
touch loaded/wgEncodeGencodePubMedV7.tab.loaded
mkdir -p loaded/
hgLoadSqlTab hg19 wgEncodeGencodeRefSeqV7 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeRefSeq.sql tables/wgEncodeGencodeRefSeqV7.tab 
Scanning through 1 files
touch loaded/wgEncodeGencodeRefSeqV7.tab.loaded
mkdir -p loaded/
hgLoadSqlTab hg19 wgEncodeGencodeUniProtV7 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeUniProt.sql tables/wgEncodeGencodeUniProtV7.tab 
Scanning through 1 files
touch loaded/wgEncodeGencodeUniProtV7.tab.loaded
mkdir -p check/
hgsql -Ne 'select geneId from wgEncodeGencodeAttrsV7 where geneId not in (select geneId from wgEncodeGencodeGeneSourceV7)' hg19 | sort -u >check/wgEncodeGencodeGeneSourceV7.missing
touch check/wgEncodeGencodeGeneSourceV7.checked
mkdir -p check/
hgsql -Ne 'select transcriptId from wgEncodeGencodeAttrsV7 where transcriptId not in (select transcriptId from wgEncodeGencodeTranscriptSourceV7)' hg19 | sort -u >check/wgEncodeGencodeTranscriptSourceV7.missing
touch check/wgEncodeGencodeTranscriptSourceV7.checked

real	48m0.763s
user	45m38.405s
sys	0m32.747s
==============================================================================
