#!/bin/bash -e
# ensemblDbImport - create a mysql database using a mysql dump from the ensembl ftp site, skipping uneeded tables. Rename chroms to ucsc standard names.
[ $# != 3 ] && {
    echo "usage: ensemblDbImport species version indir " >&2
    exit 1
}
organism=$1
version=$2
indir=$3
delay=4
ensDb=ensembl_${organism}_${version}
input="ftp://ftp.ensembl.org/pub/current_$organism/data/mysql/${organism}_${version}"
sql="${organism}_${version}.sql.gz"
exceptions="density|ditag|dna|dna_align_feature|oligo|protein_align_feature|repeat_feature"
# create mysql ensembl database from downloaded sql and data (drop old one first)
cd $indir
tables=`cat tables.tmp |grep -v -E $exceptions`
echo "Dropping database $ensDb in $delay seconds"
sleep $delay
hgsql mysql -e "drop database if exists $ensDb"
echo "Creating database $ensDb"
hgsql mysql -e "create database $ensDb"
echo "Creating empty tables in database $ensDb from $sql"
zcat $sql | sed -e 's/DEFAULT CHARSET=latin1//' -e 's/AUTO_INCREMENT=[0-9]*//' -e 's/character set latin1 collate latin1_bin//' > create_table.sql 
hgsql $ensDb < create_table.sql 
echo "Loading data"
for i in $tables ; do echo "hgLoadSqlTab $ensDb $i /dev/null $i.txt.table -oldTable ">> dbLoad.log; zcat $i.txt.table.gz > temp.txt ; hgLoadSqlTab $ensDb $i /dev/null temp.txt -oldTable 2>> dbLoad.log >> dbLoad.log; done
# rename chromosomes to conform to ucsc standards
echo "Fix chrom names"
set -x
hgsql $ensDb -e "update seq_region set name = 'chrM' where name = 'MT';"
hgsql $ensDb -e "update seq_region set name = concat('chr',name) where name between 1 and 99;"
hgsql $ensDb -e "update seq_region s, transcript t set s.name = concat('chr',s.name) where s.name in ('M','W','X','Y','Z','Un','M_random','X_random','Y_random','W_random','Z_random','Un_random') and s.seq_region_id = t.seq_region_id;"
hgsql $ensDb -e "update seq_region set name = 'chr6_cox_hap1' where name = 'c6_COX';"
hgsql $ensDb -e "update seq_region set name = 'chr6_qbl_hap2' where name = 'c6_QBL';"
hgsql $ensDb -e "update seq_region set name = 'chr5_h2_hap1' where name = 'c5_H2';"
hgsql $ensDb -e "update seq_region set name = 'chr22_h2_hap1' where name = 'c22_H2';"
#add redundant field to avoid silly outer join limitation in mysql
hgsql $ensDb -e "alter table xref add db_name varchar(27);"
hgsql $ensDb -e "update xref x, external_db e set x.db_name = e.db_name  where x.external_db_id = e.external_db_id;"
