# This file describes how the sp041115 and proteins041115 databases were
# made using the November 9th, 2004 release of the UniProt database files
# from SWISS-PROT and a few other external databases.

# STARTED ON 11/19/04, DONE ON 11/22/04.

# FIRST PARSE SWISS-PROT RAW DATA FILES AND BUILD sp041115 DB.

# Make subdirectories under /cluster/store8/swissprot

	mkdir /cluster/store8/swissprot/041115
	mkdir /cluster/store8/swissprot/041115/build
	mkdir /cluster/store8/swissprot/041115/tabFiles
	ln -s /cluster/store8/swissprot/041115 /cluster/data/swissprot/041115
	
# mkSwissProtDB.sh was updated so that it takes the date from a command-line
# argument instead of using today's date, and so that it no longer uses
# uniprot_trembl_new.dat, since that file is no longer available.

# Run mkSwissProtDB.sh to parse Swiss-Prot raw input files.

	ssh kksilo
	cd /cluster/data/swissprot/041115
	~/src/hg/protein/mkSwissProtDB.sh 041115

# Go to hgwdev and run mkSwissProtDB.sh again to create and load the sp041115 database.

      ssh hgwdev
	~/src/hg/protein/mkSwissProtDB.sh 041115

# NEXT BUILD proteins041115 DB

# Updated the mkProteinsDB.sh script, then ran it to create the proteins041115 DB.

   ~/src/hg/protein/mkProteinsDB.sh 041115

# Create indices on the spXref2 table; each took about 10 minutes.

    hgsql proteins041115 -e 'CREATE INDEX i1 ON spXref2(accession(6))'
    hgsql proteins041115 -e 'CREATE INDEX i2 ON spXref2(displayID(10))'
    hgsql proteins041115 -e 'CREATE INDEX i3 ON spXref2(extAC(24))'

# BUILD TABLES FOR pbGlobal (PB V1.1)

    cd /cluster/data/proteins/041115
    mkdir pbGlobal
    cd pbGlobal

# Calculate Pi values for all proteins

    hgsql sp041115 -e "select acc from protein" >acc041115.lis
    nice pbCalPi acc041115.lis sp041115 pi041115.tab
    hgsql proteins041115 -e 'load data local infile "pi041115.tab" into table pepPi;'
   
# Build pepMwAa table

    hgsql sp041115 -e "select acc, molWeight, aaSize from info" >pepMwAa.tab
    hgsql proteins041115 -e 'load data local infile "pepMwAa.tab" into table pepMwAa ignore 1 lines'

# Calculate global protein property distributions

       pbCalDistGlobal sp041115 proteins041115

# This took about 35 minutes.
# Load the tables

       hgsql proteins041115 -e 'load data local infile "pepCCntDist.tab"  into table pepCCntDist'
       hgsql proteins041115 -e 'load data local infile "pepHydroDist.tab" into table pepHydroDist'
       hgsql proteins041115 -e 'load data local infile "pepIPCntDist.tab" into table pepIPCntDist'
       hgsql proteins041115 -e 'load data local infile "pepMolWtDist.tab" into table pepMolWtDist'
       hgsql proteins041115 -e 'load data local infile "pepPiDist.tab"    into table pepPiDist'
       hgsql proteins041115 -e 'load data local infile "pepResDist.tab"   into table pepResDist'

# Calculate global AA residue distributions

       pbCalResStdGlobal 041115

# Load distribution tables:

        hgsql proteins041115 -e 'load data local infile "pbAnomLimit.tab" into table pbAnomLimit'
        hgsql proteins041115 -e 'load data local infile "pbResAvgStd.tab" into table pbResAvgStd'

# Get taxonomy names table from NCBI.

      cd /cluster/data/proteins/041115
	mkdir taxon
	cd taxon
	wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip
	unzip taxdmp.zip

# Create table taxonNames in proteins041115

     	# taxonNames: NCBI taxonomy names, to be loaded from names.dmp.
     	CREATE TABLE taxonNames (
     	    id       INT          NOT NULL,  # NCBI taxon ID
     	    name     VARCHAR(255) NOT NULL,  # name in binomial format
     	    info     VARCHAR(255),           # other info
     	    nameType VARCHAR(255) NOT NULL,  # type of name
     	    # Indices
     	    INDEX (id)
     	);
     
# Load from the file names.dmp into taxonNames table.
     
   load data local infile "names.dmp" into table taxonNames fields terminated by '|' enclosed by '\t';

# Load and edit the pbStamp table (starting from a copy of the one in proteins040915)

      cd /cluster/data/proteins/041115
hgsql proteins040915 -e "select * from pbStamp" > pbStamp.tab
	hgsql proteins041115 -e 'load data local infile "pbStamp.tab" into table pbStamp ignore 1 lines'

# First check to see if pbGateway and pbGlobal are working.

# Then edit pbStamp.tab to adjust the maximum y values for the various stamps
# and reload it into the pbStamp table until all the scales look reasonable.
# For this particular release, no adjustment seemed necessary.

# SWITCH SYMBOLIC PROTEIN DATABASE LINKS

# Ask system admin to switch the following symbolic database links:

       swissProt --> sp041115
       proteins  --> proteins041115

# Run some simple tests on hgTracks, hgNear, hgGene, pbTracks, and pbGlobal
# to make sure things are running OK.
    
# Release to QA for formal testing.

