# for emacs: -*- mode: sh; -*-

# This file describes how we made the browser database on
# the Patch 2 release for the NCBI build 37 (February 2009 freeze) aka:
#	GRCh37.p2 - Genome Reference Consortium Human Reference 37

############################################################################
# gather sequence and AGP definitions (DONE - 2010-08-26 - Hiram)
    mkdir -p /hive/data/genomes/hg19Patch2/sequence
    cd /hive/data/genomes/hg19Patch2/sequence
    # a round about way here since patch2 sequence was already assembled.
    # there are perl and shell scripts in
    # ../../hg19/bed/additionalSequence/patch2
    #	which created the fasta file with UCSC names
    # see also in hg19.txt:
    # Updating to patch2 sequence (DONE - 2010-08-18 - Hiram)

    ln -s ../../hg19/bed/additionalSequence/patch2/patch2.ucsc.fa .
    ln -s ../../hg19/17_ctg5_hap1/chr17_ctg5_hap1.fa .
    ln -s ../../hg19/4_ctg9_hap1/chr4_ctg9_hap1.fa .
    # the fasta files in hg19 have incorrect headers:
for C in 6_apd_hap1 6_cox_hap2 6_dbb_hap3 6_mann_hap4 6_mcf_hap5 \
        6_qbl_hap6 6_ssto_hap7
do
    rm -f chr${C}.fa
    echo ">chr${C}" > chr${C}.fa
    grep -v "^>" ../../hg19/${C}/*.fa >> chr${C}.fa
done

    cat << '_EOF_' > patch2Agp.pl
#!/usr/bin/env perl

use strict;
use warnings;

sub usage() {
    printf STDERR "usage: ./patch2Agp.pl ../../hg19/bed/additionalSequence/patch2/patches.chrom.sizes \\\n";
    printf STDERR " ../../hg19/bed/additionalSequence/patches/ucscNames.patch2.txt \\\n";
    printf STDERR " ../../hg19/bed/additionalSequence/patches/patch_release_2/PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz\n";
}

my $argc = scalar(@ARGV);

if ($argc < 3) {
    usage;
    exit 255;
}

my %skipSequence;
$skipSequence{"GL339449.1"} = 1;
$skipSequence{"GL339450.1"} = 1;

my $sizes = shift;	# patches.chrom.sizes
my $names = shift;	# patches/ucscNames.txt
my $agpFile = shift;	# alt.scaf.agp.gz

my %glToChr;
my %chrToCtg;
my %fastaToChr;
my %chrToSize;

open(FH, "<$sizes") or die "can not read $sizes";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $size) = split('\s+', $line);
    $chrToSize{$chr} = $size;
}
close (FH);

open(FH, "<$names");
while (my $line = <FH>) {
    chomp $line;
    my ($faName, $ctg, $cmName, $chr) = split('\s+', $line);
    $faName =~ s/.*gb.GL/GL/;
    next if (exists($skipSequence{$faName}));
    my $size = $chrToSize{$chr};
    if (exists($glToChr{$faName})) {
	if ($glToChr{$faName} ne $chr) {
	    printf STDERR "ERROR: contig name: $faName was chr name: $glToChr{$faName}\n";
	    printf STDERR " now claiming to be chr name: $chr\n";
	    exit 255;
	}
    } else {
	$glToChr{$faName} = $chr;
    }
    die "not defined faName" if (!defined($faName));
    die "not defined $faName $chr size" if (!defined($size));
}
close (FH);

my $prevObj = "";
my $newIx = 1;
open (FH,"zcat $agpFile|") or die "can not read $agpFile";
while (my $line = <FH>) {
    next if ($line =~ m/^\s*#/);
    chomp $line;
    my ($object, $objStart, $objEnd, $ix, $type, $frag, $fragStart, $fragEnd, $strand) = split('\s+', $line);
    next if (exists($skipSequence{$object}));
    die "ERROR: can not find contig $object to chr name"
	if (!exists($glToChr{$object}));
    $newIx = 1 if ($prevObj ne $object);
    my $chr = $glToChr{$object};
    if ($type eq "N") {
	# frag is size, fragStart is type of gap, and fragEnd is bridged y/n
	printf "%s\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n",
	    $chr, $objStart, $objEnd, $newIx, $type, $frag, $fragStart,
	    $fragEnd;
    } else {
	printf "%s\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n",
	    $chr, $objStart, $objEnd, $newIx, $type, $frag, $fragStart,
	    $fragEnd, $strand;
    }
    ++$newIx;
    $prevObj = $object;
#    printf "%s\n", $line;
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x patch2Agp.pl

    zcat \
../../hg19/bed/additionalSequence/patches/patch_release_1/AGP/alt.scaf.agp.gz \
    | grep "^GL" | sed -e "s/GL339449.1/chr5_ctg1_gl339449/; s/GL339450.1/chr9_gl339450/" > hg19Patch2.agp

    ./patch2Agp.pl \
	../../hg19/bed/additionalSequence/patch2/patches.chrom.sizes \
        ../../hg19/bed/additionalSequence/patches/ucscNames.patch2.txt \
        ../../hg19/bed/additionalSequence/patches/patch_release_2/PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz \
    >> hg19Patch2.agp

for H in chr17_ctg5_hap1 chr4_ctg9_hap1 chr6_apd_hap1 chr6_cox_hap2 \
        chr6_dbb_hap3 chr6_mann_hap4 chr6_mcf_hap5 chr6_qbl_hap6 \
        chr6_ssto_hap7 chrM_rCRS
do
    grep "^${H}" /hive/data/genomes/hg19/hg19.agp
done >> hg19Patch2.agp

echo -e "chrM_rCRS\t1\t16569\t1\tF\tNC_012920\t1\t16569\t+" >> hg19Patch2.agp

    # verify we have correct sequence and AGP file:
    faToTwoBit *.fa patch2.2bit
    checkAgpAndFa  hg19Patch2.agp patch2.2bit
    # All AGP and FASTA entries agree - both files are valid

###########################################################################
# Build the browser (DONE - 2010-08-26 - Hiram)
    cd /hive/data/genomes/hg19Patch2
    cat << '_EOF_' > hg19Patch2.config.ra
# Config parameters for makeGenomeDb.pl:
db hg19Patch2
scientificName Homo sapiens
commonName GRCh37.p2
assemblyDate Aug. 2009
assemblyLabel GRCh37 Patch 2 Genome Reference Consortium Human Reference 37 (GCA_000001405.1)
orderKey 14
mitoAcc none
fastaFiles /hive/data/genomes/hg19Patch2/sequence/*.fa
agpFiles /hive/data/genomes/hg19Patch2/sequence/hg19Patch2.agp
# qualFiles /dev/null
dbDbSpeciesDir human
taxId   9606
clade haplotypes
genomeCladePriority 139
assemblyShortLabel GRCh37.p2
'_EOF_'
    # << happy emacs

    makeGenomeDb.pl -dbHost=hgwdev -fileServer=hgwdev -workhorse=hgwdev \
	-noGoldGapSplit hg19Patch2.config.ra > makeGenomeDb.log 2>&1

    featureBits -countGaps hg19Patch2 gap
# 5650321 bases of 49709657 (11.367%) in intersection

###########################################################################
# RepeatMasker (DONE - 2010-08-26 - Hiram)
    mkdir /hive/data/genomes/hg19Patch2/bed/repeatMasker
    cd /hive/data/genomes/hg19Patch2/bed/repeatMasker
    doRepeatMasker.pl hg19Patch2 -buildDir=`pwd` -noSplit -bigClusterHub=pk \
        -dbHost=hgwdev -workhorse=hgwdev > do.log 2>&1
    cat faSize.rmsk.txt
# 49709657 bases (5650322 N's 44059335 real 21233685 upper 22825650 lower)
#	in 80 sequences in 1 files
# %45.92 masked total, %51.81 masked real

###########################################################################
# TRF simple repeats (DONE - 2010-08-26 - Hiram)
    mkdir /hive/data/genomes/hg19Patch2/bed/simpleRepeat
    cd /hive/data/genomes/hg19Patch2/bed/simpleRepeat
    doSimpleRepeat.pl hg19Patch2 -buildDir=`pwd` -dbHost=hgwdev \
        -smallClusterHub=pk -workhorse=hgwdev > do.log 2>&1
    cat fb.simpleRepeat 
# 1328856 bases of 44059336 (3.016%) in intersection

    twoBitMask hg19Patch2.rmsk.2bit \
        -add bed/simpleRepeat/trfMask.bed hg19Patch2.2bit
    twoBitToFa hg19Patch2.2bit stdout | faSize stdin \
	> faSize.hg19Patch2.2bit.txt
# 49709657 bases (5650322 N's 44059335 real 21214276 upper 22845059 lower)
#	in 80 sequences in 1 files
# %45.96 masked total, %51.85 masked real


    time blat hg19Patch2.2bit \
        /dev/null /dev/null -tileSize=11 -makeOoc=hg19Patch2.11.ooc \
	-repMatch=1024
# Wrote 41 overused 11-mers to hg19Patch2.11.ooc
    mkdir /hive/data/staging/data/hg19Patch2
    cp -p hg19Patch2.2bit /hive/data/staging/data/hg19Patch2
    cp -p hg19Patch2.11.ooc /hive/data/staging/data/hg19Patch2
    cp -p chrom.sizes /hive/data/staging/data/hg19Patch2

###########################################################################
# ctgPos track (DONE - 2010-08-26 - Hiram)
    mkdir /hive/data/genomes/hg19Patch2/bed/ctgPos
    cd /hive/data/genomes/hg19Patch2/bed/ctgPos
    for C in `cut -f1 ../../chrom.sizes`
do
    ctgPos=`hgsql -N -e 'select * from ctgPos where chrom="'$C'";' hg19`
    if [ "x${ctgPos}y" = "xy" ]; then
        GL=`echo $C | sed -e "s/.*_gl//"`
        glAcc=`grep ${GL} /hive/data/genomes/hg19/bed/additionalSequence/patches/patch_release_2/PATCHES/localID2acc | cut -f2`
        glSize=`grep ${GL} /hive/data/genomes/hg19Patch2/chrom.sizes | cut -f2`
        echo -e "$glAcc\t$glSize\t${C}\t0\t$glSize"
    else
        echo "$ctgPos"
    fi
done > ctgPos.txt

    hgLoadSqlTab hg19Patch2 ctgPos ctgPos.sql ctgPos.txt
    featureBits -countGaps hg19Patch2 ctgPos
# 49709657 bases of 49709657 (100.000%) in intersection

###########################################################################
# ctgPos2 track (DONE - 2010-08-26 - Hiram)
    mkdir /hive/data/genomes/hg19Patch2/bed/ctgPos2
    cd /hive/data/genomes/hg19Patch2/bed/ctgPos2

for C in `cut -f1 ../../chrom.sizes`
do
    ctgPos2=`hgsql -N -e 'select * from ctgPos2 where chrom="'$C'";' hg19`
    if [ "x${ctgPos}y" = "xy" ]; then
        GL=`echo $C | sed -e "s/.*_gl//"`
        glSize=`grep ${GL} /hive/data/genomes/hg19Patch2/chrom.sizes | cut -f2`
        glAcc=`grep ${GL} /hive/data/genomes/hg19/bed/additionalSequence/patches/patch_release_2/PATCHES/localID2acc | cut -f1`
        if [ "x${glAcc}y" = "xy" ]; then
            GL=`echo $C | sed -e "s/_hap.*//" | sed -e "s/chr.*_/_/" | tr '[a-z]' '[A-Z]'`
            glAcc=`grep -h ${GL} /hive/data/genomes/hg19/download/alternate_loci/ALT_REF_LOCI_?/localID2acc | cut -f1`
        fi
        echo -e "$glAcc\t$glSize\t${C}\t0\t$glSize\tF"
    else
        echo -e "$ctgPos2\tF"
    fi
done | grep -v chrM > ctgPos2.tab

echo -e "NC_012920.1\t16569\tchrM_rCRS\t0\t16569\tF" >> ctgPos2.tab

    sed -e "s/chrom(16)/chrom(19)/" $HOME/kent/src/hg/lib/ctgPos2.sql \
	> ctgPos2.sql
    hgLoadSqlTab hg19Patch2 ctgPos2 ctgPos2.sql ctgPos2.tab

    featureBits -countGaps hg19Patch2 ctgPos2
# 49709657 bases of 49709657 (100.000%) in intersection

###########################################################################
# altSequence track (DONE - 2010-08-26 - Hiram)
    # provide links to locations on reference genome where these patches and
    # haplotypes belong
    mkdir /hive/data/genomes/hg19Patch2/bed/altSequence
    cd /hive/data/genomes/hg19Patch2/bed/altSequence
    ln -s ../../../hg19/bed/additionalSequence/patch2/altSequence.bed \
	altSeq.bed.0

    cat altSeq.bed.0 | while read L
do
    C=`echo "${L}" | awk '{print $4}'`
    hg19C=`echo "${L}" | awk '{print $1}'`
    hg19S=`echo "${L}" | awk '{print $2}'`
    hg19E=`echo "${L}" | awk '{print $3}'`
    S=`grep "^${C}" ../../chrom.sizes | cut -f2`
    echo $C $S $hg19C $hg19S $hg19E | awk '{printf "%s\t0\t%d\t%s:%d-%d\t", $1, $2, $3, $4, $5}'
    echo "${L}" | awk '{printf "%d\t%s\t%d\t%d\t%s\n", $5,$6,$7,$8,$9}'
done | grep -v "chrM_rCRS:" > altSequence.tab

    hgLoadBed hg19Patch2 altSequence altSequence.tab
    featureBits -countGaps hg19Patch2 altSequence
# 49709657 bases of 49709657 (100.000%) in intersection

############################################################################
# create lift file on unBridged gaps for genbank splits (2009-03-09 - Hiram)
    mkdir /hive/data/genomes/hg19Patch2/bed/gap
    cd /hive/data/genomes/hg19Patch2/bed/gap
    gapToLift hg19Patch2 hg19Patch2.unBridged.lift -bedFile=unBridged.lift.bed
    cp -p hg19Patch2.unBridged.lift ../../jkStuff
    cp -p hg19Patch2.unBridged.lift /hive/data/staging/data/hg19Patch2

###########################################################################
# AUTO UPDATE GENBANK RUN  (DONE - 2010-08-27 - Hiram)
    # align with latest genbank process.
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # edit etc/genbank.conf to add hg19Patch2 just after hg19

# hg19Patch2 - GRCh37.p2 - Genome Reference Consortium Human Reference 37
hg19Patch2.serverGenome = /hive/data/genomes/hg19Patch2/hg19Patch2.2bit
hg19Patch2.clusterGenome = /scratch/data/hg19Patch2/hg19Patch2.2bit
hg19Patch2.ooc = /scratch/data/hg19Patch2/hg19Patch2.11.ooc
hg19Patch2.lift = /hive/data/genomes/hg19/jkStuff/hg19Patch2.unBridged.lift
hg19Patch2.refseq.mrna.native.pslCDnaFilter  = ${finished.refseq.mrna.native.pslCDnaFilter}
hg19Patch2.refseq.mrna.xeno.pslCDnaFilter    = ${finished.refseq.mrna.xeno.pslCDnaFilter}
hg19Patch2.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
hg19Patch2.genbank.mrna.xeno.pslCDnaFilter   = ${finished.genbank.mrna.xeno.pslCDnaFilter}
hg19Patch2.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter}
hg19Patch2.genbank.est.xeno.pslCDnaFilter   = ${finished.genbank.est.xeno.pslCDnaFilter}
hg19Patch2.genbank.est.xeno.load = no
hg19Patch2.genbank.est.xeno.loadDesc = no
hg19Patch2.genbank.mrna.xeno.load = no
hg19Patch2.genbank.mrna.xeno.loadDesc = no
hg19Patch2.refseq.mrna.xeno.load  = no
hg19Patch2.refseq.mrna.xeno.loadDesc = no
hg19Patch2.mgc = yes
hg19Patch2.orfeome = yes
hg19Patch2.downloadDir = hg19Patch2
hg19Patch2.ccds.ncbiBuild = 37.1
hg19Patch2.genbank.mrna.blatTargetDb = yes
hg19Patch2.perChromTables = no

    git commit -m "adding hg19Patch2" genbank.conf
    git push

    # update /cluster/data/genbank/:
    make etc-update

    ssh genbank
    screen		#	use a screen to manage this job
    cd /cluster/data/genbank
    time nice -n +19 bin/gbAlignStep -initial hg19Patch2 &
    #	logFile: var/build/logs/2010.08.27-12:00:05.hg19Patch2.initalign.log
    #	real    663m19.732s

    # load database when finished
    ssh hgwdev
    screen	# use screen to manage this long running command
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad hg19Patch2 &
    #	logFile: var/dbload/hgwdev/logs/2010.08.28-08:46:41.dbload.log
    #	real    60m50.989s

    # enable daily alignment and update of hgwdev (DONE - 2009-02-24 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add hg19Patch2 to:
        etc/align.dbs
        etc/hgwdev.dbs
    git commit -m "Added hg19Patch2 - Human - GRCh37.p2" etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

############################################################################
# new blat server for the hg19.patch2 sequence (DONE - 2010-08-27 - Hiram)
    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("hg19Patch2", "blat4", "17792", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("hg19Patch2", "blat4", "17793", "0", "1");' \
	    hgcentraltest

############################################################################
# lastz alignment to hg19 (DONE - 2010-08-30 - Hiram)
    mkdir /hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30
    cd /hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30
    #	construct a 2bit file of just the hg19 reference sequences:
mkdir -p hg19Bits

export A=-1
awk '{print $4}' ../altSequence/altSequence.tab | while read P
do
    H=`grep "${P}" ../altSequence/altSequence.tab | awk '{print $1}'`
    HE=`grep "${P}" ../altSequence/altSequence.tab | awk '{print $3}'`
    C=`echo ${P} | sed -e "s/:.*//"`
    CE=`grep "^${C}" /hive/data/genomes/hg19/chrom.sizes | cut -f2 | head -1`
    SE=`echo ${P} | sed -e "s/.*://"`
    S=`echo ${SE} | sed -e "s/-.*//" | awk '{printf "%d", $1-1}'`
    E=`echo ${SE} | sed -e "s/.*-//"`
    size=`echo $E $S | awk '{printf "%d", $1-$2}'`
    A=`echo ${A} | awk '{printf "%02d", $1+1}'`
    echo -e "$S\t$C.$A\t$size\t$C\t$CE"
    echo ">$C.$A" > hg19Bits/$C.$A.fa
    twoBitToFa /gbdb/hg19/hg19.2bit:${C}:$S-$E stdout \
	| grep -v "^>" >> hg19Bits/$C.$A.fa
done > hg19Bits.lift

    faToTwoBit hg19Bits/chr*.fa hg19Bits.2bit
    twoBitInfo hg19Bits.2bit stdout | sort -k2nr > hg19Bits.chrom.sizes

    cat << '_EOF_' > DEF
# human vs human
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
# lastz does not like the O= and E= lines in the matrix file
#       this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from hg18 vs venter1 lastz on advice from Webb
BLASTZ_K=10000
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Human Hg19Patch2
SEQ1_DIR=/scratch/data/hg19Patch2/hg19Patch2.2bit
SEQ1_LEN=/scratch/data/hg19Patch2/chrom.sizes
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
SEQ1_LIMIT=2

# QUERY: Human Hg19
SEQ2_DIR=/scratch/data/hg19/hg19.2bit
SEQ2_LEN=/scratch/data/hg19/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30/hg19Bits.2bit
SEQ2_CTGLEN=/hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30/hg19Bits.chrom.sizes
SEQ2_CHUNK=5000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
SEQ2_LIMIT=2

BASE=/hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
	-noDbNameCheck  -stop=net \
	-noLoadChainSplit -chainMinScore=2000 \
	-chainLinearGap=medium -workhorse=hgwdev \
	-smallClusterHub=encodek -bigClusterHub=swarm > net.log 2>&1
    #	Elapsed time: 3m38s

    time doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
	-noDbNameCheck  -continue=load -stop=load \
	-noLoadChainSplit -chainMinScore=2000 \
	-chainLinearGap=medium -workhorse=hgwdev \
	-smallClusterHub=encodek -bigClusterHub=swarm > load.log 2>&1
    #	real    0m54.258s
    cat fb.hg19Patch2.chainHg19Link.txt
    #	42748025 bases of 44059336 (97.024%) in intersection

    # rework that job list to run these guys only against their bit of
    #	the reference sequence.
    cd /hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30

    # move all of these out of the way until the next one is done
    mv psl psl.0
    mv run.blastz run.blastz.0
    mv pslParts pslParts.0
    mv axtNet axtNet.0
    mv mafNet mafNet.0
    # start a new lastz run with a specific jobList
    mkdir -p run.blastz/tParts run.blastz/qParts
export A=-1
awk '{print $4}' ../altSequence/altSequence.tab | while read P
do
    H=`grep "${P}" ../altSequence/altSequence.tab | awk '{print $1}'`
    HE=`grep "${P}" ../altSequence/altSequence.tab | awk '{print $3}'`
    C=`echo ${P} | sed -e "s/:.*//"`
    CE=`grep "^${C}" /hive/data/genomes/hg19/chrom.sizes | cut -f2 | head -1`
    SE=`echo ${P} | sed -e "s/.*://"`
    S=`echo ${SE} | sed -e "s/-.*//" | awk '{printf "%d", $1-1}'`
    E=`echo ${SE} | sed -e "s/.*-//"`
    size=`echo $E $S | awk '{printf "%d", $1-$2}'`
    A=`echo ${A} | awk '{printf "%02d", $1+1}'`
    echo -e "/hive/data/staging/data/hg19Patch2/hg19Patch2.2bit:$H:0-$HE" \
        > run.blastz/tParts/$A.lst
    echo -e "/hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30/hg19Bits.2bit:$C.$A:0-$size" \
        > run.blastz/qParts/$A.lst
    echo -e "/cluster/bin/scripts/blastz-run-ucsc -outFormat psl tParts/$A.lst qParts/$A.lst ../DEF {check out exists ../psl/$H.$C.$A.psl }"
done > run.blastz/jobList
    ssh swarm
    cd /hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30/run.blastz
    para create jobList
    para try
    para check ... push ... check
    para time
# Completed: 80 of 80 jobs
# CPU time in finished jobs:        161s       2.69m     0.04h    0.00d  0.000 y
# IO & Wait Time:                   238s       3.96m     0.07h    0.00d  0.000 y
# Average job time:                   5s       0.08m     0.00h    0.00d
# Longest finished job:              20s       0.33m     0.01h    0.00d
# Submission to last job:            37s       0.62m     0.01h    0.00d

    #	put together the individual results:
    cd /hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30
    cat psl/chr*.psl | gzip -c > pslParts/hg19Patch2.hg19.psl.gz

    #	constructing a chain from those results
    cd /hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30/axtChain/run
zcat ../../pslParts/hg19Patch2.hg19.psl.gz \
| axtChain -psl -verbose=0 -scoreScheme=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q  -minScore=2000 -linearGap=medium stdin \
    /scratch/data/hg19Patch2/hg19Patch2.2bit \
    /hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30/hg19Bits.2bit \
    stdout \
| chainAntiRepeat /scratch/data/hg19Patch2/hg19Patch2.2bit \
    /hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30/hg19Bits.2bit \
    stdin hg19Patch2.hg19.preLift.chain
liftUp -chainQ hg19Patch2.hg19.lifted.chain \
    ../../hg19Bits.lift carry hg19Patch2.hg19.preLift.chain

    # constructing the net files:
cd /hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30/axtChain

chainMergeSort run/hg19Patch2.hg19.lifted.chain | nice gzip -c > hg19Patch2.hg19.all.chain.gz
chainSplit chain hg19Patch2.hg19.all.chain.gz
# Make nets ("noClass", i.e. without rmsk/class stats which are added later):
chainPreNet  hg19Patch2.hg19.all.chain.gz /scratch/data/hg19Patch2/chrom.sizes /scratch/data/hg19/chrom.sizes stdout \
| chainNet  stdin -minSpace=1 /scratch/data/hg19Patch2/chrom.sizes /scratch/data/hg19/chrom.sizes stdout /dev/null \
| netSyntenic stdin noClass.net

# Make liftOver chains:
netChainSubset -verbose=0 noClass.net hg19Patch2.hg19.all.chain.gz stdout \
| chainStitchId stdin stdout | gzip -c > hg19Patch2.hg19.over.chain.gz

# Make axtNet for download: one .axt per hg19Patch2 seq.
netSplit noClass.net net
cd ..
mkdir -p axtNet
foreach f (axtChain/net/*.net)
netToAxt $f axtChain/chain/$f:t:r.chain \
  /scratch/data/hg19Patch2/hg19Patch2.2bit /scratch/data/hg19/hg19.2bit stdout \
  | axtSort stdin stdout \
  | gzip -c > axtNet/$f:t:r.hg19Patch2.hg19.net.axt.gz
end

# Make mafNet for multiz: one .maf per hg19Patch2 seq.
mkdir -p mafNet
foreach f (axtNet/*.hg19Patch2.hg19.net.axt.gz)
  axtToMaf -tPrefix=hg19Patch2. -qPrefix=hg19. $f \
        /scratch/data/hg19Patch2/chrom.sizes /scratch/data/hg19/chrom.sizes \
        stdout \
  | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end

    # no need for these as reference
    rm -fr psl.0 run.blastz.0 pslParts.0 axtNet.0 mafNet.0

    # swap that business to hg19
    mkdir /hive/data/genomes/hg19/bed/blastz.hg19Patch2.swap
    cd /hive/data/genomes/hg19/bed/blastz.hg19Patch2.swap
    time doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30/DEF \
	-swap -noDbNameCheck  -stop=load \
	-noLoadChainSplit -chainMinScore=2000 \
	-chainLinearGap=medium -workhorse=hgwdev \
	-smallClusterHub=encodek -bigClusterHub=swarm > swap.load.log 2>&1

    # and then fixup the chains to include the haplotypes
    cd /hive/data/genomes/hg19/bed/blastz.hg19Patch2.swap/axtChain
    # split up each chain by the hg19Patch2 query sequences
    mkdir -p queryChains
    chainSplit -q queryChains hg19.hg19Patch2.all.chain.gz

    # then run a 'lift over' chain/net on each single one
    mkdir -p singleLiftOver

for F in queryChains/*.chain
do
    C=`basename ${F}`
    B=`echo ${C} | sed -e "s/.chain//"`
    chainPreNet -inclHap ${F} /scratch/data/hg19/chrom.sizes \
        /scratch/data/hg19Patch2/chrom.sizes stdout \
    | chainNet -inclHap stdin -minSpace=1 /scratch/data/hg19/chrom.sizes \
        /scratch/data/hg19Patch2/chrom.sizes singleLiftOver/${B}.raw.net \
        /dev/null
    netSyntenic singleLiftOver/${B}.raw.net singleLiftOver/${B}.noClass.net
    netFilter -chimpSyn singleLiftOver/${B}.noClass.net > singleLiftOver/${B}.chimpSyn.net
    netChainSubset -verbose=0 singleLiftOver/${B}.noClass.net \
        ${F} stdout \
    | chainStitchId stdin stdout > singleLiftOver/${C}
    echo "${F} -> singleLiftOver/${C}"
done
    # put the chains together into one file
    chainMergeSort singleLiftOver/chr*.chain | gzip -c \
	> hg19.hg19Patch2.single.over.chain.gz

    # construct psl files from those chains
    chainToPsl hg19.hg19Patch2.single.over.chain.gz \
	/hive/data/genomes/hg19/chrom.sizes \
        /hive/data/genomes/hg19Patch2/chrom.sizes \
        /hive/data/genomes/hg19/hg19.2bit \
        /hive/data/genomes/hg19Patch2/hg19Patch2.2bit \
        hg19.hg19Patch2.over.psl
    # chainToPsl appears to have a problem, note errors from pslCheck:
    pslCheck -db=hg19 hg19.hg19Patch2.over.psl
# Error: invalid PSL: chr6_ssto_hap7:3797750-3798078 chr6:32538701-32539032 + hg19.hg19Patch2.over.psl:362
# alignment size (328) doesn't match counts (0)
    pslRecalcMatch hg19.hg19Patch2.over.psl \
	/hive/data/genomes/hg19/hg19.2bit \
	/hive/data/genomes/hg19Patch2/hg19Patch2.2bit \
	fixup.hg19.hg19Patch2.over.psl
    pslCheck -db=hg19 fixup.hg19.hg19Patch2.over.psl
    checked: 391 failed: 0 errors: 0

    # load this PSL track
    hgLoadPsl hg19 -table=altSeqLiftOverPsl fixup.hg19.hg19Patch2.over.psl

############################################################################
