#!/usr/bin/perl
# Usage/help text.  It is appended to the error message reported through
# gbError when unexpected positional arguments are given on the command line.
my $usage = 
    " gbProcessStep [-keep] [-verbose] [-mkOrganismList]\n"
    . "\n"
    . "Process downloaded genbank and refseq data files that have not been\n"
    . "processed.  This generates, *.ra, *.fa, and *.gbidx files for\n"
    . "each downloaded file. It also splits them into mRNAs and ESTs.\n"
    . "\n"
    . "It checks all full and daily download directories and see if they need\n"
    . "processed.  This code maps the download directory names, which follow the\n"
    . "NCBI layout, to a consistent layout.\n"
    . "Can process either flat-file or ASN.1 download files.\n"
    . "\n"
    . "Options\n"
    . "  -keep - keep tmp files for debugging.\n"
    . "  -mkOrganismList - update the organism list\n"
    . "  -verbose - print details\n";
#
# $Id: gbProcessStep,v 1.14 2010/04/11 06:48:31 markd Exp $
#
use strict;
use warnings;
use File::Basename;
use FindBin;
use lib "$FindBin::Bin/../lib";
use gbCommon;
setupServerPath();

# Build a file name (or glob pattern) that carries an accession prefix.
# The prefix is inserted directly after the first dot-separated component
# of the base name; the directory part and the remaining extensions are
# kept, e.g. ("*", "dir/est.fa.tmp") yields "dir/est*.fa.tmp".
sub getAccPrefixedFile($$) {
    my($prefix, $path) = @_;
    my($stem, @exts) = split(/\./, basename($path));
    my $prefixedName = $stem . $prefix . "." . join('.', @exts);
    return join("/", dirname($path), $prefixedName);
}

# Map base output file names to the list of files that actually exist.
# When $ext is defined it is appended to every base name first.  For type
# "est" each name is expanded with a glob, since EST output is split into
# several files named after accession prefixes; for any other type the
# name is kept only if that file exists.
sub getOutFileList($$@) {
    my($type, $ext, @baseFiles) = @_;
    my $isEst = ($type eq "est");
    my @found;
    for my $base (@baseFiles) {
        my $candidate = defined($ext) ? $base . $ext : $base;
        if ($isEst) {
            # glob only returns names that match existing files
            push(@found, glob(getAccPrefixedFile("*", $candidate)));
            next;
        }
        push(@found, $candidate) if (-e $candidate);
    }
    return @found;
}

# Rename a list of files, removing or replacing an extension, and apply
# gbChmod to them first (write protection, per the original author's note).
#
# Arguments:
#   $oldExt - extension to strip from the end of each base name
#   $newExt - extension to append in its place; undef to only strip $oldExt
#   @files  - files to rename
#
# Returns the list of new file names, in the same order as @files.
sub renameOutFiles($$@) {
    my($oldExt, $newExt, @files) = @_;
    gbChmod(@files);
    my @newFiles;
    foreach my $oldf (@files) {
        # basename() treats the suffix literally, so the dots in $oldExt
        # are not regex wildcards here.
        my $newf = dirname($oldf) . "/" . basename($oldf, $oldExt);
        if (defined($newExt)) {
            $newf .= $newExt;
        }
        renameFile($oldf, $newf);
        push(@newFiles, $newf);
    }
    return @newFiles;
}

# Process a set of sequence files of one type into an output directory.
#
# Arguments:
#   $type    - "est" runs gbProcess with -type=est -byAccPrefix=2 (so there
#              can be several output files); any other value runs -type=mrna.
#   $outDir  - directory receiving the output files (passed to makeDir).
#   $srcDb   - source database; "refseq" additionally requests peptide
#              fasta output (pep.fa).
#   @gbFiles - input files handed to gbProcess; must not be empty.
#
# Output is written with a .tmp extension and renamed once gbProcess has
# finished.  $outDir/$type.md5 is written last and flags successful
# completion; it is left empty when no ra files were produced for this type.
# Dies if @gbFiles is empty or the empty md5 file cannot be created.
sub processSeqs($$$@) {
    my($type, $outDir, $srcDb, @gbFiles) = @_;
    prMsg("processing $type $outDir");
    if ($#gbFiles < 0) {
        die("no gbFiles for $type $srcDb $outDir");
    }

    # remove old data files in case this is rerunning due to failure
    makeDir($outDir);
    unlink(glob("$outDir/$type.*"));

    # Create file in an atomic manner.  For ESTs we split by first two
    # letters of accession, so we can have multiple.
    my $gbidx = "$outDir/$type.gbidx";
    my $fa = "$outDir/$type.fa";
    my $ra = "$outDir/$type.ra";
    my $pepFa = "$outDir/pep.fa";

    # remove all old files, both final and leftover .tmp ones; this also
    # covers the per-accession-prefix EST files and pep.fa, which the
    # "$type.*" glob above does not match
    unlink(getOutFileList($type, undef, ($gbidx, $fa, $ra, $pepFa)));
    unlink(getOutFileList($type, ".tmp", ($gbidx, $fa, $ra, $pepFa)));
    makeDir($outDir);    

    # generate files with .tmp extension
    my $cmd = "gbProcess -gbidx=$gbidx.tmp";
    if ($type eq "est") {
        $cmd .= " -type=est -byAccPrefix=2";
    } else {
        $cmd .= " -type=mrna";
    }
    if ($srcDb eq "refseq") {
        # add option to output peptide seqs for refseq
        $cmd .= " -pepFa=$pepFa.tmp";
    }
    $cmd .= " $fa.tmp $ra.tmp " . join(" ", @gbFiles);
    runProg($cmd);

    # process resulting output files
    my $md5File = "$outDir/$type.md5";
    my @raTmpFiles = getOutFileList($type, ".tmp", $ra);
    my @faTmpFiles = getOutFileList($type, ".tmp", ($fa, $pepFa));
    my @gbIdxTmpFiles = getOutFileList($type, ".tmp", $gbidx);

    if ($#raTmpFiles >= 0) {
        # compress ra files and rename
        runProg("gzip -1 " . join(" ", @raTmpFiles));
        my @raGzTmpFiles = getOutFileList($type, ".tmp.gz", $ra);
        my @outFiles = renameOutFiles(".tmp.gz", ".gz", @raGzTmpFiles);

        # rename fasta files
        push(@outFiles, renameOutFiles(".tmp", undef, @faTmpFiles));

        # rename gbidx files.  This MUST be last, as it's an indication
        # that the other files are done
        push(@outFiles, renameOutFiles(".tmp", undef, @gbIdxTmpFiles));

        if ($#outFiles < 0) {
            die("bug: must have output files if here");
        }

        # Checksum files, which flags successful completion.
        md5Files($md5File, @outFiles);
    } else {
        # No ra files were created for this type; create an empty md5 file
        # so the directory is still flagged as processed.  A lexical handle
        # is used so no global bareword filehandle is left behind.
        open(my $md5Fh, ">", $md5File) || die("can't create $md5File: $!");
        close($md5Fh) || die("close of $md5File failed: $!");
    }
}

# Process one GenBank release.  The full release is split into many
# gb*.seq.gz (or gb*.aso.gz) files.  To avoid missing mRNAs if a new
# division is created, we explicitly skip only the files known not to
# contain mRNAs and process all the rest.  The skipping is purely for
# speed; processSeqs passes the sequence type on to gbProcess.  ESTs are
# all in their own division (gbest*).  Daily update files are then handled
# one at a time, each into its own daily.* directory.
sub processGenBankRel($) {
    my($rel) = @_;
    my $gbMRnaSkip = "(/gbest.*)|(/gbgss.*)|(/gbhtg.*)|(/gbsts.*)|(/gbcon.*)";
    my $gbMRnaSkipRE = qr/$gbMRnaSkip/;

    my $downDir = "data/download/$rel";
    my $fullDir = "data/processed/$rel/full";

    # full mRNAs: every release file that is not in a skipped division
    unless (-e "$fullDir/mrna.md5") {
        my @mrnaFiles = grep { $_ !~ $gbMRnaSkipRE }
            (glob("$downDir/gb*.aso.gz"), glob("$downDir/gb*.seq.gz"));
        processSeqs("mrna", $fullDir, "genbank", @mrnaFiles) if @mrnaFiles;
    }

    # full ESTs: only the EST division files
    unless (-e "$fullDir/est.md5") {
        my @estFiles = (glob("$downDir/gbest*.aso.gz"),
                        glob("$downDir/gbest*.seq.gz"));
        processSeqs("est", $fullDir, "genbank", @estFiles) if @estFiles;
    }

    # genbank dailies: each file holds both mRNAs and ESTs, so it is run
    # once per type unless that type's md5 file already exists
    my @dailyFiles = (glob("$downDir/daily-nc/nc*.aso.gz"),
                      glob("$downDir/daily-nc/nc*.flat.gz"));
    foreach my $dailyFile (@dailyFiles) {
        # derive the update name (daily.<id>) from the download file name
        my $update = $dailyFile;
        $update =~ s/^.*\/nc(.+)\.(flat|aso)\.gz$/daily.$1/;
        my $dailyDir = "data/processed/$rel/$update";
        foreach my $type ("mrna", "est") {
            next if (-e "$dailyDir/$type.md5");
            processSeqs($type, $dailyDir, "genbank", ($dailyFile));
        }
    }
}

# Process one RefSeq release: the complete RNA files of the full release,
# followed by each daily update file.  Work whose mrna.md5 file already
# exists is skipped.
sub processRefSeqRel($) {
    my($rel) = @_;
    my $downDir = "data/download/$rel";
    my $fullDir = "data/processed/$rel/full";

    # full mRNAs; note there is no empty-list guard here, so processSeqs
    # dies if no complete*.rna files are found
    unless (-e "$fullDir/mrna.md5") {
        my @fullFiles = (glob("$downDir/release/complete/complete*.rna.aso.gz"),
                         glob("$downDir/release/complete/complete*.rna.gbff.gz"));
        processSeqs("mrna", $fullDir, "refseq", @fullFiles);
    }

    # daily mRNAs
    my @dailyFiles = (glob("$downDir/daily/rsnc.*.*.aso.gz"),
                      glob("$downDir/daily/rsnc.*.*.gbff.Z"),
                      glob("$downDir/daily/rsnc.*.*.gbff.gz"));
    foreach my $dailyFile (@dailyFiles) {
        # swap the two numeric fields of the file name (month-day and
        # year) so the update directories sort sanely
        my $update = $dailyFile;
        $update =~ s/^.*\/rsnc\.([0-9]+)\.([0-9]+)\.(gbff|aso)\.(Z|gz)$/daily.$2.$1/;
        my $dailyDir = "data/processed/$rel/$update";
        next if (-e "$dailyDir/mrna.md5");
        processSeqs("mrna", $dailyDir, "refseq", $dailyFile);
    }
}

# Rebuild data/organism.lst.  gbMkOrganismList writes to a .tmp file,
# which is then renamed over the real list.
sub createOrganismList() {
    my $orgList = "data/organism.lst";
    my $orgListTmp = $orgList . ".tmp";
    runProg("gbMkOrganismList " . $orgListTmp);
    renameFile($orgListTmp, $orgList);
}

# entry: parse options, then process every downloaded release
$main::keep = 0;
$main::verbose = 0;
$main::mkOrganismList = 0;

# consume leading arguments that start with a dash
while ((scalar(@ARGV) > 0) && ($ARGV[0] =~/^-.*/)) {
    my $opt = shift(@ARGV);
    if ($opt eq "-keep") {
        $main::keep = 1;
    } elsif ($opt eq "-verbose") {
        $main::verbose = 1;
        $gbCommon::verbose = 1;
    } elsif ($opt eq "-mkOrganismList") {
        $main::mkOrganismList = 1;
    } else {
        gbError("invalid option \"$opt\"");
    }
}

# no positional arguments are accepted
if (scalar(@ARGV) > 0) {
    gbError("wrong # args: $usage");
}
checkOnBuildServer();

beginTask("build", "process");

my($file, $rel);

# Collect all downloaded releases.  Either genbank or refseq may be
# missing (useful for testing), but not both.
my @gbReleases = getReleases("download", "genbank");
my @rsReleases = getReleases("download", "refseq");
if ((scalar(@gbReleases) == 0) && (scalar(@rsReleases) == 0)) {
    gbError("no data/download/genbank.* or data/download/refseq.* directories found in "
            . `pwd`);
}

# GenBank releases
foreach $rel (@gbReleases) {
    processGenBankRel($rel);
}

# RefSeq releases
foreach $rel (@rsReleases) {
    processRefSeqRel($rel);
}

# validate now to catch problems before the database load
runProg("gbProcessedCheck");

createOrganismList() if ($main::mkOrganismList);

endTask();

