#!/usr/bin/perl
#
# gbDbLoadStep [options] database ...
#
# Load databases on the current server.
#
# $Id: gbDbLoadStep,v 1.50 2009/09/11 06:25:32 genbank Exp $
#
use strict;
use warnings;
use File::Basename;
use FindBin;
use lib "$FindBin::Bin/../lib";
use gbCommon;

# Command line help text.  It is appended to the error message when an
# invalid option is given or no database is specified.
my $usage =
    " gbDbLoadStep [options] database ...\n"
    . "\n"
    . "Load databases on the current server.\n"
    . "\n"
    . "Options:\n"
    . "   -workdir=work/\$host/dbload\n"
    . "   -allowLargeDeletes - override check for deleting large number of\n"
    . "    entries.\n"
    . "   -verbose\n"
    . "   -keep\n"
    . "   -initialLoad\n"
    . "   -srcDb=db - restrict load to srcDb (genbank or refseq)\n"
    . "   -type=type - restrict load to type (mrna or est)\n"
    . "   -orgCat=orgCat - restrict load to orgCat (native or xeno)\n"
    . "   -drop - drop tables before load.  This removes all of genbank and\n"
    . "    refseq data, not just what is being loaded.\n"
    . "   -reload - reload the selected partition of data; use with -srcDb\n"
    . "    and -type.  This does not work on ESTs, use -drop and reload\n"
    . "    all entries, which will be much faster anyway.\n"
    . "   -forceIgnoreDelete - force entries in ignore.idx to be dropped even\n"
    . "    if not in gbStatus.  Slow, but a nice way to clean up confusion.\n"
    . "   -reloadList=file - Force the reload of sequences listed in this file.\n"
    . "   -gbdbGenBank=dir - Use dir instead of /gbdb/genbank, for testing.\n"
    . "   -rebuildDerived - rebuild genePred and gbMiscDiff tables\n"
    . "   -byPassGbLoaded - by-pass checking gbLoaded table and explicitly\n"
    . "    check for new updates.  Needed if a full release was missed.\n"
    . "   -grepIndexRootDir=dir - Location to build grepIndex files. overrides genbank.conf\n"
    . "    grepIndex.rootDir variable.\n"
    . "\n"
    . "If a file:\n"
    . "    var/dbload/\$host/reload.acc\n"
    . "exists, then a reload will be forced for accessions in this file.\n"
    . "If this completes successfully, then the file will be renamed to:\n"
    . "    var/dbload/\$host/reload.acc.done\n"
    . "A file in the form:\n"
    . "    var/dbload/\$host/reload.\$db.acc\n"
    . "applies only to that database\n";


# work directory passed to gbLoadRna as -workdir and removed at the end
# unless -keep is given; defaults to work/$host/dbload
my $workDir;
# the -verbose or -verbose=value option to forward to gbLoadRna, if given
my $verboseArg;
# standard set of arguments passed to gbLoadRna for every database
my @args;
# -drop: drop tables before loading each database
my $drop = 0;
# -allowLargeDeletes: forwarded to gbLoadRna when loading a database
my $allowLargeDeletes = 0;
# -forceIgnoreDelete: forwarded to gbLoadRna in the standard arguments
my $forceIgnoreDelete = 0;
# reload list file, from -reloadList or from the host-wide reload.acc file
my $reloadList;
# path of the host-wide var/dbload/$host/reload.acc file, if it exists
my $varReloadFile;
# root directory for grepIndex files, from -grepIndexRootDir or the
# grepIndex.rootDir configuration variable; grepIndex is skipped if undefined
my $grepIndexRootDir;

# Check for a reload accession file under var/dbload/$host.  Returns the
# path of the file if it exists, otherwise undef.
#
# Arguments:
#   $db - optional database name.  If supplied, the database-specific file
#         reload.$db.acc is checked for, otherwise the host-wide reload.acc.
#
# If the file exists while another reload list is already in effect, the
# conflict is reported with gbError.  The host-wide reload file is tested
# first: the main program copies $varReloadFile into $reloadList, so testing
# $reloadList first would blame -reloadList for a conflict that is really
# between two reload files.
sub getVarReloadFile(;$) {
    my($db) = @_;
    my $rf = defined($db)
        ? "var/dbload/$gbCommon::hostName/reload.${db}.acc"
        : "var/dbload/$gbCommon::hostName/reload.acc";
    if (!-e $rf) {
        return undef;
    }
    if (defined($varReloadFile)) {
        gbError("only one reload file allowed, found $varReloadFile and $rf");
    }
    if (defined($reloadList)) {
        gbError("-reloadList specified and $rf exists");
    }
    return $rf;
}

# Rename a reload file by appending ".done" to its name.
sub renameVarReloadFile($) {
    my $reloadFile = shift;
    my $doneFile = $reloadFile . ".done";
    renameFile($reloadFile, $doneFile);
}

# Dump the gbExtFile table of a database to a date-stamped text file; the
# dump is used to find releases that are no longer needed.  The rows are
# written to a .tmp file first, which is then renamed to the final name.
sub dumpExtFileTbl($) {
    my $db = shift;
    my $stamp = getDateStamp();
    my $extFileDir = "$gbCommon::varDir/dbload/$gbCommon::hostName/extFile/" . $stamp;
    my $dumpFile = $extFileDir . "/$db.extFile.txt";
    my $tmpFile = "$dumpFile.tmp";
    makeFileDir($dumpFile);
    runMysql("-N -e 'select * from gbExtFile' $db >$tmpFile");
    renameFile($tmpFile, $dumpFile);
}

# Build the grepIndex files for a database with dumpGrepIndex.  Nothing is
# done when no grepIndex root directory has been configured.
sub setupGrepIndex($) {
    my $db = shift;
    return if !defined($grepIndexRootDir);

    prMsg("creating grepIndex files for $db") if $gbCommon::verbose;
    runProg("dumpGrepIndex $grepIndexRootDir $db");
}

# Drop the genbank tables of a database by running gbLoadRna -drop.
sub dropTables($) {
    my $db = shift;
    prMsg("dropping tables from $db") if $gbCommon::verbose;
    runProg("gbLoadRna -drop $db");
}

# Load the ccds tables (ccdsInfo, ccdsNotes, ccdsGene) into a database and,
# if the database has a knownGene table, build the ccdsKgMap table as well.
sub ccdsLoad($$) {
    my($db, $ncbiBuild) = @_;
    # The ccds:ccds profile is used everywhere except on hgwdev, so that any
    # user can load hgwdev without having to update .hg.conf files.
    my $ccdsDb = "ccds:ccds";
    if ($gbCommon::hostName eq "hgwdev") {
        $ccdsDb = "ccds";
    }
    runProg("ccdsMkTables -loadDb $ccdsDb $db $ncbiBuild ccdsInfo ccdsNotes ccdsGene");
    runProg("mkCcdsGeneMap -db=$db -loadDb ccdsGene knownGene ccdsKgMap")
        if haveMysqlTbl($db, "knownGene");
}

# Rebuild the ccds tables of a database when the ccds import is newer than
# the last ccds load recorded for this database on this host.
sub ccdsUpdate($$) {
    my($db, $ncbiBuild) = @_;
    my $loadStampFile = "var/dbload/$gbCommon::hostName/ccdsload.$db.time";
    # note: both times are zero if the corresponding file doesn't exist
    my $importedAt = loadTimeFile("var/ccdsimport/ccdsimport.time");
    my $loadedAt = loadTimeFile($loadStampFile);
    return unless $importedAt > $loadedAt;

    ccdsLoad($db, $ncbiBuild);
    makeTimeFile($loadStampFile);
}

# Load one database: optionally drop its tables, run gbLoadRna with the
# standard arguments, then build the grepIndex files, update the ccds
# tables if ccds.ncbiBuild is configured for the database, and dump the
# gbExtFile table.
sub dbLoad($) {
    my $db = shift;
    # the database-specific reload file is looked up before anything is
    # dropped or loaded
    my $dbReloadFile = getVarReloadFile($db);
    dropTables($db) if $drop;

    # standard arguments plus those specific to this load
    my @loadArgs = @args;
    push(@loadArgs, "-allowLargeDeletes") if $allowLargeDeletes;
    push(@loadArgs, "-reloadList=$dbReloadFile") if defined($dbReloadFile);
    runProg("gbLoadRna " . join(" ", @loadArgs) . " $db");

    # the reload file has been used by the load, rename it
    renameVarReloadFile($dbReloadFile) if defined($dbReloadFile);
    setupGrepIndex($db);

    my $ncbiBuild = getDbConfUndef($db, "ccds.ncbiBuild");
    ccdsUpdate($db, $ncbiBuild) if defined($ncbiBuild);
    dumpExtFileTbl($db);
}


# Entry: parse the command line
my ($keep, $initialLoad, $reload, $rebuildDerived, $byPassGbLoaded) = (0, 0, 0, 0, 0);
my ($gbdbGenBank, $srcDb, $orgCat, $type);

# options that are simple switches, with the variable each one turns on
my %switchVars = (
    "-drop"              => \$drop,
    "-reload"            => \$reload,
    "-initialLoad"       => \$initialLoad,
    "-allowLargeDeletes" => \$allowLargeDeletes,
    "-keep"              => \$keep,
    "-forceIgnoreDelete" => \$forceIgnoreDelete,
    "-rebuildDerived"    => \$rebuildDerived,
    "-byPassGbLoaded"    => \$byPassGbLoaded,
);

# consume leading arguments that start with a dash, the rest are databases
while (@ARGV && ($ARGV[0] =~ /^-.*/)) {
    my $opt = shift(@ARGV);
    if (exists($switchVars{$opt})) {
        ${$switchVars{$opt}} = 1;
    } elsif ($opt eq "-verbose") {
        $verboseArg = "-verbose";
        $gbCommon::verbose = 1;
    } elsif ($opt =~ /^-verbose=/) {
        $verboseArg = "-verbose=" . parseOptEq($opt);
        $gbCommon::verbose = 1;
    } elsif ($opt =~ /^-workdir($|=)/) {
        $workDir = parseOptEq($opt);
    } elsif ($opt =~ /^-type($|=)/) {
        $type = parseOptEq($opt);
    } elsif ($opt =~ /^-srcDb($|=)/) {
        $srcDb = parseOptEq($opt);
    } elsif ($opt =~ /^-orgCat($|=)/) {
        $orgCat = parseOptEq($opt);
    } elsif ($opt =~ /^-gbdbGenBank=/) {
        $gbdbGenBank = parseOptEq($opt);
    } elsif ($opt =~ /^-reloadList(=|$)/) {
        $reloadList = parseOptEq($opt);
    } elsif ($opt =~ /^-grepIndexRootDir(=|$)/) {
        $grepIndexRootDir = parseOptEq($opt);
    } else {
        gbError("invalid option \"$opt\"\n$usage");
    }
}
gbError("wrong # args: $usage") if !@ARGV;
my @databases = @ARGV;

# a host-wide reload file on the file system supplies the reload list
$varReloadFile = getVarReloadFile();
$reloadList = $varReloadFile if defined($varReloadFile);

# grepIndex dir falls back to the configuration (and may remain undefined)
$grepIndexRootDir = findConf("grepIndex.rootDir") unless defined($grepIndexRootDir);

# assemble the standard set of arguments to pass
$workDir = "work/$gbCommon::hostName/dbload" unless defined($workDir);
push(@args, "-workdir=$workDir");
push(@args, "-initialLoad") if $initialLoad;
push(@args, $verboseArg) if defined($verboseArg);
# options carrying a value are only passed on when they were set
foreach my $valueOpt (["gbdbGenBank", $gbdbGenBank],
                      ["srcDb", $srcDb],
                      ["orgCat", $orgCat],
                      ["type", $type],
                      ["reloadList", $reloadList]) {
    my($optName, $optValue) = @$valueOpt;
    push(@args, "-$optName=$optValue") if defined($optValue);
}
push(@args, "-forceIgnoreDelete") if $forceIgnoreDelete;
push(@args, "-rebuildDerived") if $rebuildDerived;
push(@args, "-byPassGbLoaded") if $byPassGbLoaded;
push(@args, "-reload") if $reload;
             
beginTask("dbload/$gbCommon::hostName", "dbload");
setupHgConf();

# load each of the requested databases in turn
for my $database (@databases) {
    dbLoad($database);
}

# the host-wide reload file, if one was used, is renamed after the loads
renameVarReloadFile($varReloadFile) if defined($varReloadFile);

# the work directory is removed unless -keep was given
runProg("rm -rf $workDir") unless $keep;
endTask();

