#!/bin/sh
#
# Trawl pcp-daily log directories looking for _all_ the failures
# for one or more tests
#
# Look in $HOME/Logs/by-vm, which must exist (otherwise we exit with an error).
# Expect to find a bunch of subdirs like vm00, vm01, ... bozo, bozo-vm,
# etc and within those, a qa subdir which contains the .out and .out.bad
# (and .full) files for the failing tests.
#

# Per-process prefix for scratch files, and the exit status, which stays
# at 1 (failure) until it is explicitly changed.
tmp="/var/tmp/$$"
sts=1
# On exit, or on HUP/INT/QUIT/TERM, remove every $tmp.* scratch file and
# exit with whatever $sts holds at that moment.
trap 'rm -f $tmp.*; exit $sts' EXIT HUP INT QUIT TERM

_usage()
{
    # Write the option summary to stdout, set the failure status and
    # leave the script; this function does not return to its caller.
    cat <<End-of-Usage
Usage: $0 [options] [seq ...]
Options:
  -C    include results from CI
  -d    debug
  -f    show me seq.full if it exists
  -g    path to base of PCP git tree [default: $HOME/src/pcp]
  -l    run show-me -l
  -m    gather unique .out.bad files and prepare email
  -t    report triaged tests (default is to skip 'em)
End-of-Usage
    sts=1
    exit
}

_map_ci_hostname()
{
    # Filter (stdin to stdout): on each line replace a leading "test-"
    # with "ci-" and drop a trailing "-container".
    sed '
s/^test-/ci-/
s/-container$//'
}

# Option defaults; each one is changed by the command line flag noted.
full=false		# -f: page through seq.full after show-me
pcpdir=$HOME/src/pcp	# -g: base of the PCP git tree
verbose=false		# no option sets this (not referenced in the visible script)
show_me_opts=''		# -l: options accumulated here are handed to show-me
prepare_mail=false	# -m: gather unique .out.bad files and draft an email
debug=false		# -d: diagnostics on stderr
CI=false		# -C: include results from CI
triaged=false		# -t: report triaged tests as well
while getopts 'Cdfg:lmt?' p
do
    case "$p"
    in
	C)	CI=true
		;;

	d)	debug=true
		;;

	f)	full=true
		;;

	g)	pcpdir="$OPTARG"
		;;

	l)	show_me_opts="$show_me_opts -l"
		;;

	m)	prepare_mail=true
		;;

	t)	triaged=true
		;;

	?)	# -? or anything getopts rejected
		_usage
		# NOTREACHED
		;;
    esac
done
# discard the options, leaving only the seq arguments (if any)
shift `expr $OPTIND - 1`
# the email describes a single test, so -m is only valid with one seq
if $prepare_mail && [ $# -ne 1 ]
then
    echo "Error: -m needs exactly one seq argument"
    exit
fi

# $pcpdir may come from -g, so quote it in case the path contains
# white space or glob characters.
if [ ! -d "$pcpdir/qa" ]
then
    echo "Error: $pcpdir/qa does not exist ... expecting PCP qa dir"
    exit
fi

# All per-host directories are expected below $HOME/Logs/by-vm; $here
# records that directory so later code can cd back to it.
if [ -d "$HOME/Logs/by-vm" ]
then
    if cd "$HOME/Logs/by-vm"
    then
	here=`pwd`
    else
	echo "Error: cannot cd to $HOME/Logs/by-vm"
	exit
    fi
else
    echo "Error: $HOME/Logs/by-vm does not exist"
    exit
fi

# No seq arguments: use every test with a .out.bad file anywhere below
# here (file name minus directory and .out.bad suffix, sorted
# numerically, duplicates removed).
if [ $# -eq 0 ]
then
    set -- `find -L * -name "*.out.bad" \
            | sed -e 's;.*/;;' -e 's/\.out\.bad$//' \
	    | sort -n \
	    | uniq`
fi

$debug && echo >&2 "seqs=$*"
    
# _seq_in_last_line seq file
#
# Succeed if the last line of file mentions seq as a complete test
# number: preceded by a space and followed by end-of-line or a non-digit.
# A plain substring match would report 100 as present in a line that
# only lists 1000.
#
_seq_in_last_line()
{
    tail -1 "$2" | grep -e " $1\$" -e " $1[^0-9]" >/dev/null
}

for seq
do
    # pad 1- and 2-digit test numbers with leading zeroes to 3 digits
    case "$seq"
    in
	[0-9])
		seq=00$seq
		;;
	[0-9][0-9])
		seq=0$seq
		;;
    esac
    base=''
    # $tmp.sum gathers one "host checksum" line per .out.bad handled for
    # this seq; it starts out holding a single empty line
    echo >$tmp.sum
    # without -C only directories matching b* and vm* are searched
    if $CI
    then
	dirs=*
    else
	dirs='b* vm*'
    fi
    # NOTE: as the last stage of a pipeline this loop body normally runs
    # in a subshell, so an exit below ends the scan for this seq only
    find -L $dirs -name "$seq.out.bad" \
    | sort \
    | while read bad
    do
	$debug && echo >&2 "bad=$bad"
	# $bad is relative to $here and the previous iteration changed
	# directory
	cd "$here"
	sum=`shasum <"$bad" | sed -e 's/ .*//'`
	# host is the first component of the path
	host=`echo "$bad" | sed -e 's/\/.*//'`
	dir=`dirname "$bad"`
	if [ ! -d "$dir" ]
	then
	    echo "Arrgh: bad=$bad but dir=$dir is not a directory!"
	    exit
	fi
	cd "$dir"
	$debug && echo >&2 "pwd=`/bin/pwd`"
	rm -f $tmp.failed $tmp.triaged
	# Walk the logs in reverse name order and stop at the first one
	# whose last Failures: or Triaged: line mentions this test;
	# test.log is only consulted with -C.
	for log in `ls -r ../????-??-?? test.log 2>/dev/null`
	do
	    if $CI
	    then
		:
	    else
		[ "$log" = test.log ] && continue
	    fi
	    if grep '^Failures: ' $log >$tmp.tmp
	    then
		# found a Failures: line ... is our test included in the
		# last Failures: line?
		#
		if _seq_in_last_line "$seq" $tmp.tmp
		then
		    touch $tmp.failed
		    break
		fi
	    fi
	    if grep '^Triaged: ' $log >$tmp.tmp
	    then
		# found a Triaged: line ... is our test included in the
		# last Triaged: line?
		#
		if _seq_in_last_line "$seq" $tmp.tmp
		then
		    touch $tmp.triaged
		    break
		fi
	    fi
	done
	if [ ! -f $tmp.failed -a ! -f $tmp.triaged ]
	then
	    # cleanup because test was subsequently made to pass?
	    #
	    echo -n "$seq: not a failure in any $host daily log ... clean up? [n] "
	    # stdin is the pipe from find, so the answer comes from the tty
	    read ans </dev/tty
	    if [ -n "$ans" -a "$ans" = y ]
	    then
		rm $seq.*
	    fi
	    continue
	fi
	# triaged failures are skipped unless -t was given
	[ -f $tmp.triaged -a "$triaged" = false ] && continue
	if grep " $sum" <$tmp.sum >/dev/null 2>&1
	then
	    # identical .out.bad already shown for an earlier host
	    match_host=`grep " $sum" <$tmp.sum | sed -e 's/ .*//' -e 1q`
	    echo "$host: same $seq.out.bad as $match_host"
	else
	    # make sure the files from the PCP qa directory are present
	    # here as symlinks before show-me is run
	    for qabits in \
		common common.check common.config common.filter common.install.cisco \
		common.pcpweb common.product common.rc common.setup localconfig \
		group show-me
	    do
		if [ -L $qabits ]
		then
		    :
		else
		    rm -f $qabits
		    ln -s "$pcpdir/qa/$qabits" $qabits
		fi
	    done
	    [ ! -f $seq.out ] && ln -s "$pcpdir/qa/$seq.out" $seq.out
	    # prefix for whatever show-me prints: host- or host-triaged-
	    echo -n "$host-"
	    [ -f $tmp.triaged ] && echo -n "triaged-"
	    [ -f qa/$seq.out.bad ] && cd qa
	    show-me $show_me_opts $seq
	    $full && [ -f $seq.full ] && less $seq.full </dev/tty
	fi
	echo $host $sum >>$tmp.sum
	$debug && ( echo >&2 "sum ..."; cat >&2 $tmp.sum )
    done

done

# -m: collect one copy of each distinct $seq.out.bad in /tmp/show-me-all
# and write a draft email (subject, body, host table, attachment list)
# on stdout.  $seq and $tmp.sum are those left behind by the last test
# processed by the per-seq loop.
if $prepare_mail
then
    var=0
    rm -rf /tmp/show-me-all
    if mkdir /tmp/show-me-all
    then
	:
    else
	echo "Arrgh: cannot mkdir /tmp/show-me-all"
	exit
    fi
    # $tmp.map: "variant-number checksum" lines, one per distinct checksum
    # $tmp.mail: one table row per host; created empty so the sort below
    # always has a file to read
    rm -f $tmp.map $tmp.mail
    touch $tmp.map $tmp.mail
    cat $tmp.sum \
    | while read host sum
    do
	# skip the empty line at the start of $tmp.sum
	[ -z "$host" ] && continue
	myvar=`grep " $sum\$" $tmp.map | sed -e 's/ .*//'`
	if [ -z "$myvar" ]
	then
	    # first time this checksum is seen: save a copy as the next
	    # variant and remember the number
	    if [ -f "$HOME/Logs/by-vm/$host/qa/$seq.out.bad" ]
	    then
		cp "$HOME/Logs/by-vm/$host/qa/$seq.out.bad" /tmp/show-me-all/$seq.out.bad-$var
	    elif [ -f "$HOME/Logs/by-vm/$host/$seq.out.bad" ]
	    then
		cp "$HOME/Logs/by-vm/$host/$seq.out.bad" /tmp/show-me-all/$seq.out.bad-$var
	    else
		echo "Warning: can't find $seq.out.bad for $host"
	    fi
	    myvar=$var
	    echo "$var $sum" >>$tmp.map
	    var=`expr $var + 1`
	fi
	# Describe the host with its line from $HOME/whatami.out when
	# there is one, else fall back to the bare host name.
	myhost=''
	if [ -f "$HOME/whatami.out" ]
	then
	    myhost=`grep "^$host " "$HOME/whatami.out"`
	    if [ -z "$myhost" ]
	    then
		# retry with the last component of the physical path of
		# the host directory, mapped by _map_ci_hostname
		cd "$HOME/Logs/by-vm/$host"
		cihost=`/bin/pwd | sed -e "s@.*/@@" | _map_ci_hostname`
		myhost=`grep "^$cihost " "$HOME/whatami.out"`
		cd "$here"
	    fi
	fi
	[ -z "$myhost" ] && myhost=$host
	printf " %2d   %s\n" $myvar "$myhost" >>$tmp.mail
    done
    # The text after the test number on its line in the qa group file,
    # with " local" and " remote" removed; read from $pcpdir so that -g
    # is honoured here as well.
    seq_groups=`grep "^$seq " "$pcpdir/qa/group" \
		| sed -e "s/^$seq //" -e 's/ local//' -e 's/ remote//'`
    echo "Subject: QA failures for qa/$seq ($seq_groups)"
    echo
    echo "QA test $seq is failing on a number of machines in the QA Farm and CI."
    echo
    echo "If you can help with diagnosis that would be most appreciated."
    echo
    echo "Even better would be code changes if this indicates there is a real"
    echo "bug or QA changes if it represents a QA test failure."
    echo
    # number of variants saved; keep only the digits because some wc
    # implementations pad the count with blanks
    numvar=`echo /tmp/show-me-all/$seq.out.bad-* | wc -w | sed -e 's/[^0-9]//g'`
    if [ "$numvar" -gt 1 ]
    then
	echo "Details for qa/$seq failures."
	echo
	echo "bad-  Host                PCP      CPU     Operating System"
	sort -k1,1n -k2,2 <$tmp.mail
	echo
	echo "The $numvar variants of the $seq.out.bad file are attached."
	echo
	echo "Attachments: `echo /tmp/show-me-all/$seq.out.bad-*`"
    else
	echo "The failure is the same on all the following hosts."
	echo
	echo "Host                PCP      CPU     Operating System"
	# one variant only, so drop the 6-character variant number column
	sort -k1,1n -k2,2 <$tmp.mail | sed -e 's/^......//'
	echo
	echo "The $seq.out.bad file is attached."
	echo
	mv /tmp/show-me-all/$seq.out.bad-0 /tmp/show-me-all/$seq.out.bad
	echo "Attachment: /tmp/show-me-all/$seq.out.bad"
    fi
fi

# reached the end, so report success from the exit trap
sts=0
