#!/usr/bin/perl
# Usage/help text, printed when the command line has unexpected arguments.
# Kept as a single-quoted heredoc so the text appears exactly as written.
my $usage = <<'END_USAGE';
gbDownloadStep [-verbose] [-ftpVerbose] [-genbank] [-refseq] [-warnOnCheckError]

Download required genbank and refseq data files.  The result will reflect
the current state of NCBI ftp site, although only a subset of files will be
retrieved.

It can be run in an empty directory or one where the script has been
previously run.  It handles rollovers to new genbank releases automatically.
This uses the built-in ftp module to avoid problems with error reporting
in other ftp clients and to implement very strict and safe download
procedures.

Options
  -verbose - print details
  -ftpVerbose - print details of interaction with ftp server
  -genbank - only update genbank
  -refseq - only update refseq
  -warnOnCheckError - issue a warning instead of an error if check of existing file
   against ftp site fails.  This can be enabled on individual files by create
   an empty file with the same path and the extension .warnOnCheckError

END_USAGE

#
# $Id: gbDownloadStep,v 1.16 2010/04/11 06:48:30 markd Exp $
#

use strict;
use warnings;
use File::Basename;
use FindBin;
use lib "$FindBin::Bin/../lib";
use Net::FTP;
use gbCommon;
use gbFtp;
# NOTE(review): setupServerPath() is not defined in this file; it presumably
# comes from one of the project modules loaded above (gbCommon or gbFtp) and
# prepares the environment for the build server -- confirm in that module.
setupServerPath();

# When true, the ASN.1 variants are used everywhere: the ncbi-asn1 ftp
# directory and the .aso/.bna file patterns.  It is hard-wired here; no
# command line option changes it.
my $useAsn1 = 0;  # download ASN.1 files instead of GBFF

# Constants
# Host name handed to ftpInit() and used when building error messages.
my $FTP_HOST = "ftp.ncbi.nih.gov";

# Scratch directory (relative to the current directory) that receives the
# temporary copy of GB_Release_Number.
my $tmpDir = "tmp";

# Package globals set from the command line options:
#   verbose          - print progress details to STDERR (-verbose)
#   genbankUpd       - update GenBank (-genbank)
#   refseqUpd        - update RefSeq (-refseq)
#   warnOnCheckError - warn instead of failing when a local file does not
#                      match the ftp site (-warnOnCheckError)
# If neither genbankUpd nor refseqUpd is requested, both are enabled after
# option parsing.
$main::verbose = 0;
$main::genbankUpd = 0;
$main::refseqUpd = 0;
$main::warnOnCheckError = 0;

# Decide whether a mismatch between the given local file and the ftp server
# should be reported as a warning rather than an error.  The answer is true
# when the -warnOnCheckError option is in effect, or when a marker file
# named "<localFile>.warnOnCheckError" exists.
sub shouldWarnOnCheckError($) {
    my ($localFile) = @_;
    if ($main::warnOnCheckError) {
        return $main::warnOnCheckError;
    }
    # per-file override: an (empty) marker file next to the local file
    my $markerFile = "${localFile}.warnOnCheckError";
    return (-e $markerFile);
}

# Determine the version of GenBank on the NCBI server from the
# GB_Release_Number file (taken from the ncbi-asn1 directory when $useAsn1
# is set, otherwise from the genbank directory).
#
# The file is fetched into $tmpDir, read, and removed again.  Its contents
# are stripped of surrounding whitespace (including a trailing CR from a
# CRLF line ending) and must consist only of word characters and dots;
# anything else (empty file, error page, embedded whitespace or slashes) is
# reported with gbError instead of being turned into a bogus directory name.
#
# Returns the release number with ".0" appended, e.g. "262.0".
sub getGenBankVer() {
    ftpOpen();
    my $remRelNum = $useAsn1 ? "ncbi-asn1/GB_Release_Number"
        : "genbank/GB_Release_Number";
    my $locRelNum = "$tmpDir/GB_Release_Number";
    # make sure a stale copy from an earlier run can't be picked up
    unlink $locRelNum;
    ftpSafeGet($remRelNum, $locRelNum);
    my $version = readFile($locRelNum);
    # remove the temporary copy before validating, so it is cleaned up even
    # when the contents are rejected
    unlink($locRelNum);

    $version = "" if !defined($version);
    $version =~ s/^\s+//;
    $version =~ s/\s+$//;
    # the version becomes part of a directory name; refuse anything unexpected
    if ($version !~ m/\A[\w.]+\z/) {
        gbError("invalid release number \"$version\" in ftp://$FTP_HOST/$remRelNum");
    }
    return $version . ".0";
}

# Determine the version of RefSeq on the NCBI server by looking in the
# refseq/release/release-catalog directory for a file named
# release<version>.files.installed.  The first matching entry in the
# directory listing supplies the version; it is an error if none matches.
sub getRefSeqVer() {
    ftpOpen();
    my $version;
    for my $catFile (ftpGetDirList("refseq/release/release-catalog")) {
        if ($catFile =~ m/release-catalog\/release(.+)\.files\.installed$/) {
            $version = $1;
            last;
        }
    }
    gbError("can't find ftp://$FTP_HOST/refseq/release/release-catalog/release*.files.installed")
        unless defined($version);
    return $version;
}

# Download a full release, or verify one that is already present.
#
#   justCheck - if true, only validate the local files against the server
#               (nothing is expected to be downloaded)
#   remDir    - directory on the ftp server to list
#   outDir    - local directory receiving the files
#   fileRE    - file names (without directory) must match this to be handled
#   excludeRE - optional; file names matching this are skipped
#
# When files were downloaded, their checksums are written to
# "$outDir/full.md5".  Returns true if any files were downloaded.
#
# NOTE(review): full.md5 is built only from the files fetched by this call.
# If an earlier, interrupted run already left some files in outDir, they
# would presumably be missing from full.md5 -- confirm against the behavior
# of ftpGetOrCheck and md5Files.
sub downloadOrCheckFull($$$$;$) {
    my($justCheck, $remDir, $outDir, $fileRE, $excludeRE) = @_;

    my @remotePaths = ftpGetDirList($remDir);

    if ($main::verbose) {
        print STDERR ($justCheck
                      ? "check of $remDir with $outDir\n"
                      : "full download of $remDir to $outDir\n");
    }

    my @fetched;
    for my $remotePath (@remotePaths) {
        my $name = basename($remotePath);
        my $wanted = ($name =~ $fileRE)
            && !(defined($excludeRE) && ($name =~ $excludeRE));
        if (!$wanted) {
            print STDERR "skip remote file: $remotePath\n" if $main::verbose;
            next;
        }
        my $localPath = "$outDir/$name";
        if (ftpGetOrCheck($justCheck, shouldWarnOnCheckError($localPath), $remotePath, $localPath)) {
            push(@fetched, $localPath);
        }
    }

    my $gotAny = (@fetched > 0);
    if ($gotAny) {
        # a pure check must never report a download
        die("BUG: shouldn't be here") if $justCheck;
        md5Files("$outDir/full.md5", @fetched);
    }
    return $gotAny;
}

# Download or check the daily files.  Every entry of remDir whose file name
# matches fileRE is passed to ftpGetOrCheck (never in check-only mode) with
# "$outDir/<name>" as the local file; each file that ftpGetOrCheck reports
# as downloaded gets its own "<name>.md5" checksum file in outDir.
sub downloadOrCheckDaily($$$) {
    my($remDir, $outDir, $fileRE) = @_;
    my @remotePaths = ftpGetDirList($remDir);

    print STDERR "download or check of $remDir to $outDir\n" if $main::verbose;

    for my $remotePath (@remotePaths) {
        my $name = basename($remotePath);
        unless ($name =~ $fileRE) {
            print STDERR "skip remote file: $remotePath\n" if $main::verbose;
            next;
        }
        my $localPath = "$outDir/$name";
        my $downloaded = ftpGetOrCheck(0, shouldWarnOnCheckError($localPath), $remotePath, $localPath);
        md5Files("$outDir/$name.md5", $localPath) if $downloaded;
    }
}

# Update the GenBank files.  The release directory is
# data/download/genbank.<version>.  If it has no full.md5 yet, the full
# release is downloaded; otherwise the existing full release is checked
# against the server.  The non-cumulative daily files are then fetched or
# verified.  Returns the GenBank version.
sub updateGenBank() {
    my $version = getGenBankVer();

    # server directory and file name patterns depend on the file format
    my $ftpDir = $useAsn1 ? "ncbi-asn1" : "genbank";
    my $fullRE = $useAsn1
        ? qr/(^gb.*\.aso\.gz$)|(^README.*$)|(^GB_Release_Number$)/
        : qr/(^gb.*\.seq\.gz$)|(^README.*$)|(^gbacc\.idx\.gz$)|(^GB_Release_Number$)/;
    my $dailyRE = $useAsn1 ? qr/^nc.*\.aso\.gz$/ : qr/^nc.*\.flat\.gz$/;
    ftpOpen();

    my $releaseDir = "data/download/genbank.$version";

    # An existing full.md5 marks a completed full download, so only check.
    my $justCheck = -e "$releaseDir/full.md5";

    # Get or check the full release; no exclusion pattern is used.
    downloadOrCheckFull($justCheck, $ftpDir, $releaseDir, $fullRE);

    # Explicitly get the release notes from the flat-file directory, as they
    # are not in the asn1 directory.
    my $relNotes = "$releaseDir/gbrel.txt";
    ftpGetOrCheck($justCheck, shouldWarnOnCheckError($relNotes), "genbank/gbrel.txt", $relNotes);

    # Checksumming the files takes enough time that the connection may time
    # out, so reopen.
    ftpOpen();

    # now get or verify dailies
    downloadOrCheckDaily("$ftpDir/daily-nc", "$releaseDir/daily-nc", $dailyRE);

    ftpClose();
    return $version;
}

# Update the RefSeq files.  The release directory is
# data/download/refseq.<version>.  If release/complete has no full.md5 yet,
# the full release is downloaded; otherwise the existing full release is
# checked against the server.  The non-cumulative daily files are then
# fetched or verified.
sub updateRefSeq() {
    my($version) = getRefSeqVer();
    my($fullRE, $dailyRE);
    if ($useAsn1) {
        $fullRE = qr/(^complete.+\.bna\.gz$)/;
        $dailyRE = qr/(^rsnc.*\.bna\.gz$)/;
    } else {
        # the dot before "rna" is escaped so it only matches a literal ".",
        # in line with the .bna pattern above
        $fullRE = qr/(^complete.+\.rna\.gbff\.gz$)/;
        $dailyRE = qr/(^rsnc.*\.gbff\.gz$)/;
    }

    ftpOpen();

    my $refSeqDir = "data/download/refseq.$version";
    # An existing full.md5 marks a completed full download, so only check.
    my $justCheck = -e "$refSeqDir/release/complete/full.md5";

    downloadOrCheckFull($justCheck, "refseq/release/complete", "$refSeqDir/release/complete", $fullRE);

    # Checksumming the files takes enough time that the connection may time
    # out, so reopen.
    ftpOpen();

    # Get or verify dailies. We just download all dailies, even if they are
    # before the cumulative download.  It's too risky to rely on the times.
    downloadOrCheckDaily("refseq/daily", "$refSeqDir/daily", $dailyRE);
    ftpClose();
}

# Entry point: parse the leading options, then run the requested updates.

# Action for each recognized option.
my %optActions = (
    "-verbose"          => sub { $main::verbose = 1; $gbCommon::verbose = 1; },
    "-ftpVerbose"       => sub { $gbFtp::verbose = 1; },
    "-genbank"          => sub { $main::genbankUpd = 1; },
    "-refseq"           => sub { $main::refseqUpd = 1; },
    "-warnOnCheckError" => sub { $main::warnOnCheckError = 1; },
);

# Consume every leading argument that starts with a dash.
while (($#ARGV >= 0) && ($ARGV[0] =~/^-.*/)) {
    my $opt = shift @ARGV;
    my $action = $optActions{$opt};
    if (defined($action)) {
        $action->();
    } else {
        gbError("invalid option \"$opt\"");
    }
}

# With neither -genbank nor -refseq given, update both.
unless ($main::genbankUpd || $main::refseqUpd) {
    $main::genbankUpd = 1;
    $main::refseqUpd = 1;
}

# No positional arguments are accepted.
die("Wrong # args: $usage") if @ARGV;
checkOnBuildServer();

beginTask("build", "download");
ftpInit($FTP_HOST);

updateGenBank() if $main::genbankUpd;
updateRefSeq() if $main::refseqUpd;
ftpClose();
endTask();
