#!/usr/bin/awk -f 
#
# dbload-times dblog1 [dblog2 ...]
#
# Analyze gbLoadRna log files and generate a report of where time is spent.
# The logs probably need to be produced with at least -verbose=1.
#
# Report a fatal error on stderr and terminate with exit status 1.
#   msg     - text appended to the "Error: " prefix
#   errLine - local scratch variable; callers do not pass it
# NOTE: when called from a main (pattern) rule, awk still runs the END
# rule after exit, so the report is printed before the process ends.
function die(msg,    errLine) {
    errLine = "Error: " msg;
    print errLine > "/dev/stderr";
    exit 1;
}
# Convert a duration in seconds to hours, returned as a string with
# three digits after the decimal point.
#   sec        - duration in seconds
#   secPerHour - local scratch variable; callers do not pass it
function getHours(sec,    secPerHour) {
    secPerHour = 60.0 * 60.0;
    return sprintf("%0.3f", sec / secPerHour);
}

BEGIN {
    # Fields of every print statement with comma-separated arguments are
    # joined with tabs, so the report is tab-separated.
    OFS = "\t";
}

# begin line, get host
# hgw1 2006.07.20-16:30:02 dbload: begin
/dbload: begin$/ {
    # The first field of a begin line is taken as the host name.  Record
    # it in the set of hosts seen and make it the current host, which the
    # later rules use to attribute counts and load times.
    hosts[$1] = 1;
    host = $1;
}

# Counts of sequence changes
# gbLoadRna: genbank.136.0 hg16 native,xeno mrna: delete=0 seqChg=0 metaChg=0 extChg=0 new=552880 orphan=0 noChg=0
/^gbLoadRna: .* delete=/ {
    # Accumulate the per-category sequence change counts reported on a
    # gbLoadRna summary line, both per (host, db, srcDb, type) key and
    # per host.  The global `key' set here is also used by the load-time
    # rule to attribute the following timing line.
    #
    # NOTE(review): the sample line in the comment above this rule would
    # put the source database in $2, the db in $3 and the type in $5; the
    # code reads $3, $4 and $6 -- confirm against real log output.
    db=$4;
    if ($3 ~ /^genbank/) {
        srcDb = "genbank";
    } else if ($3 ~ /^refseq/) {
        srcDb = "refseq";
    } else {
        die("can't parse srcDb: " $0);
    }
    if ($6 ~ /^mrna/) {
        type = "mrna";
    } else if ($6 ~ /^est/) {
        type = "est";
    } else {
        die("can't parse type: " $0);
    }

    # The key doubles as the leading four tab-separated report columns.
    key = host "\t" db "\t" srcDb "\t" type;
    keys[key] = 1;

    # Three-argument match() (gawk extension) stores the parenthesized
    # groups in parts[1..7], in the order they appear in the pattern.
    if (match($0, "delete=([0-9]*) seqChg=([0-9]*) metaChg=([0-9]*) extChg=([0-9]*) new=([0-9]*) orphan=([0-9]*) noChg=([0-9]*)", parts) == 0) {
        die("can't parse counts from: " $0);
    }

    # per-key counts
    deleteCnt[key] += parts[1];
    seqChgCnt[key] += parts[2];
    metaChgCnt[key] += parts[3];
    extChgCnt[key] += parts[4];
    newCnt[key] += parts[5];
    orphanCnt[key] += parts[6];
    noChgCnt[key] += parts[7];

    # per-host totals
    deleteTotalCnt[host] += parts[1];
    seqChgTotalCnt[host] += parts[2];
    metaChgTotalCnt[host] += parts[3];
    extChgTotalCnt[host] += parts[4];
    # Must be spelled newTotalCnt: that is the array the END report reads.
    # The previous spelling (newTOtalCnt) left the "new" total column empty.
    newTotalCnt[host] += parts[5];
    orphanTotalCnt[host] += parts[6];
    noChgTotalCnt[host] += parts[7];
}

# load time
# <-load for genbank.154.0 anoGam1 native,xeno mrna: real=143.37 cpu=23.13 sys=10.31 wait=109.93 childCpu=0.04 childSys=0.04 tod=2006-07-20T16:32:38
/<-load for .* real=/ {
    # Pull the real= (elapsed) value out of the line and add it to the
    # running totals for the current host and for the current key, i.e.
    # the key set by the most recent gbLoadRna counts line.
    if (!match($0, "real=([0-9.]*)", parts)) {
        die("can't parse load time: " $0);
    }
    dbLoadTotalTime[host] += parts[1];
    dbLoadTime[key] += parts[1];
}

END {
    # Header row; columns are joined with OFS.
    print "host", "db", "srcdb", "type",
          "delete", "seqChg", "metaChg", "extChg", "new", "orphan", "noChg",
          "hours";

    # One row per host/db/srcDb/type combination.  The key already holds
    # the first four tab-separated columns.
    for (key in keys) {
        print key,
              deleteCnt[key], seqChgCnt[key], metaChgCnt[key],
              extChgCnt[key], newCnt[key], orphanCnt[key], noChgCnt[key],
              getHours(dbLoadTime[key]);
    }

    # One summary row per host: "total" in the db column, with the srcdb
    # and type columns left empty.
    for (host in hosts) {
        print host, "total", "", "",
              deleteTotalCnt[host], seqChgTotalCnt[host], metaChgTotalCnt[host],
              extChgTotalCnt[host], newTotalCnt[host], orphanTotalCnt[host],
              noChgTotalCnt[host],
              getHours(dbLoadTotalTime[host]);
    }
}
