# Download SwissProt (UniProt) and create tab-separated files from it.
# This takes about 70 minutes for the wgets and
# 15 minutes for the spToDb step.

# Log in to hgwdev (presumably the remaining commands are run in that session).
ssh hgwdev

# The release directory physically lives on store11; the store5 path is a
# symlink to it, and the rest of this procedure works through the store5 path.
mkdir -p /cluster/store11/swissprot/070202/build
ln -s /cluster/store11/swissprot/070202 /cluster/store5/swissprot/070202
cd /cluster/store5/swissprot/070202/build

# Fetch the SwissProt and TrEMBL flat files plus the splice-variant FASTA
# in a single wget run.
wget \
    ftp://us.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz \
    ftp://us.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.dat.gz \
    ftp://us.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot_varsplic.fasta.gz

# Stream both .dat.gz files (the varsplic FASTA does not match the glob)
# through spToDb, which reads stdin and is pointed at ../tabFiles for output.
zcat *.dat.gz | spToDb stdin ../tabFiles


# Create the (empty) release database by piping the statement to hgsql.
echo 'create database sp070202;' | hgsql mysql

# Populate it with the table definitions kept in the source tree.
hgsql sp070202 < ~/kent/src/hg/protein/spToDb/spDb.sql

# Import every tab file into the database, one hgsqlimport call per file.
# This takes roughly 45 minutes to an hour (one run started at 3:00 and
# was done by 4:00).
# NOTE(review): presumably each file loads into the table named after it
# (e.g. feature.txt -> feature) -- confirm against hgsqlimport.
cd /cluster/store5/swissprot/070202/tabFiles
foreach tabFile (*.txt)
    hgsqlimport --local sp070202 $tabFile
end

# The first load was mostly clean, but it reported warnings on the
# following tables:
# sp070202.feature: Records: 3534564  Deleted: 0  Skipped: 0  Warnings: 112
# sp070202.organelle: Records: 3553  Deleted: 0  Skipped: 0  Warnings: 3

# Fixed the feature table definition by changing softEndBits from a char
# to a smallint. Result after the fix:
# sp070202.feature: Records: 3534564  Deleted: 0  Skipped: 0  Warnings: 0
# Changed organelle.val from varchar to longblob. Result after the fix:
# sp070202.organelle: Records: 3553  Deleted: 0  Skipped: 0  Warnings: 0

# Add splice-variant (varsplic) info.
# Run from the tabFiles directory (see the cd above), so ../build/ is the
# download directory. spDbAddVarSplice is given the database name, reads the
# FASTA from stdin, and is pointed at "." -- apparently where it writes the
# var*.txt files loaded below.
zcat ../build/uniprot_sprot_varsplic.fasta.gz | spDbAddVarSplice sp070202 stdin .
# Append each generated file to its table.
# NOTE(review): /dev/null presumably stands in for the table-definition file,
# since -append adds rows to tables that already exist -- confirm against
# hgLoadSqlTab usage.
hgLoadSqlTab sp070202 -append varProtein /dev/null varProtein.txt
# varProtein.txt is loaded a second time, into the protein table.
# NOTE(review): this looks intentional (variants added alongside the main
# protein records) -- confirm before changing.
hgLoadSqlTab sp070202 -append protein /dev/null varProtein.txt
hgLoadSqlTab sp070202 -append varAcc /dev/null varAcc.txt
hgLoadSqlTab sp070202 -append displayId /dev/null varDisplayId.txt
hgLoadSqlTab sp070202 -append accToTaxon /dev/null varAccToTaxon.txt
hgLoadSqlTab sp070202 -append geneLogic /dev/null varGeneLogic.txt
hgLoadSqlTab sp070202 -append gene /dev/null varGene.txt
hgLoadSqlTab sp070202 -append description /dev/null varDescription.txt

# Compress the tab files for people who prefer them to the database.
# The glob matches every .txt file in the current directory, so any var*.txt
# files written here by the varsplice step are compressed as well.
gzip *.txt

