#!/usr/bin/awk -f
#
#  acc-list-cmp expectAccs gotAccs
#
# Compare two files containing lists of accessions.  The accessions may
# occur multiple times.  Output is written to stdout as tab-separated
# lines with the columns:
#     acc change delta
#
# change is: 
#    new - acc is new in gotAccs, delta is the number added
#    del - acc is deleted in gotAccs, delta is the negative of the number
#          deleted
#    chg - acc occurs in both files but a different number of times,
#          delta is the +/- change in number

# Emit report columns separated by tabs.
BEGIN { OFS = "\t" }

# Tally the accession (first field) of every non-blank input record.
# Records read from the first file operand are counted in expectAcc,
# records from any other operand in gotAcc; allAcc collects the union of
# all accessions seen.  Blank lines carry no accession and are skipped so
# they are not reported as an empty-named accession.
NF > 0 {
    acc = $1;
    # ARGIND (a gawk extension) is the index in ARGV of the file currently
    # being read, which selects the table to count in.
    if (ARGIND == 1) {
        expectAcc[acc]++;
    } else {
        gotAcc[acc]++;
    }
    allAcc[acc] = acc;
}

# Report every accession whose occurrence count differs between the two
# inputs, one line per accession: acc, change (new/del/chg), delta.
END {
    # Change to an indexed and sorted array: asort() (a gawk extension)
    # replaces allAcc with its values, sorted, under indices 1..numAcc.
    numAcc = asort(allAcc);
    for (i = 1; i <= numAcc; i++) {
        acc = allAcc[i];
        # Test membership with "in": merely referencing a missing subscript
        # would create an empty element in the count array.
        expectCnt = (acc in expectAcc) ? expectAcc[acc] : 0;
        gotCnt = (acc in gotAcc) ? gotAcc[acc] : 0;
        if (gotCnt == expectCnt) {
            continue;   # counts agree, nothing to report
        }
        if (gotCnt == 0) {
            stat = "del";
        } else if (expectCnt == 0) {
            stat = "new";
        } else {
            stat = "chg";
        }
        print acc, stat, (gotCnt - expectCnt);
    }
}
