#!/usr/bin/perl -w
#
# spamscan
# ========
# The spamscan filter is used by logwatch to analyse messages
# output by spamd in order to evaluate local rule performance. 
# 
# This filter is implemented as a perl script. It outputs 
# statistics for good messages vs. spam, followed by a usage 
# analysis of rules declared in /etc/mail/spamassassin/local.cf
#
# Copyright 2009, Martin Gregorie.
#
# Spamscan is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with the script. If not, see <http://www.gnu.org/licenses/>.
#              
# Parse and validate the command line options
# ===========================================
# Set defaults
#

# Option defaults. These are package globals because the reporting code
# further down the script reads them directly.
$help = 0;
$alpha = 0;
$topdown = 0;
$summary = 1;
$unused = 0;
$localrules = "/etc/mail/spamassassin/local.cf";

#
# Parse the command line.
# Each recognised switch sets exactly one flag. Anything else is
# reported and the script stops with a non-zero exit status, so that a
# calling shell or script can tell that no report was produced. The
# message itself is still written to STDOUT.
#
foreach $arg (@ARGV)
{
   if ($arg eq "-?")
   {
      $help = 1;
   }
   elsif ($arg eq "-alpha")
   {
      $alpha = 1;
   }
   elsif ($arg eq "-nosummary")
   {
      $summary = 0;
   }
   elsif ($arg eq "-topdown")
   {
      $topdown = 1;
   }
   elsif ($arg eq "-unused")
   {
      $unused = 1;
   }
   else
   {
      printf("Error: unknown option %s\n", $arg);
      exit 1;
   }
}

#
# Display help if requested and exit.
#
if ($help)
{
   # Reduce $0 to the bare script name for the syntax line.
   $prgnam = $0;
   $prgnam =~ s/^.*\///;

   # print is used rather than printf: the script name is data and must
   # not become part of a format string, where a '%' in the name would
   # be misread as a conversion.
   print("\n",
         "Syntax:   $prgnam [options]\n",
         "Function: Report local Spamassassin rule usage.\n",
         "          Input is read from STDIN.\n",
         "          Spam detection statistics are always shown.\n",
         "          Rules statistics default to the top 10 summary.\n",
         "\n",
         "Options:  -alpha      list rules alphabetically\n",
         "          -nosummary  suppresses the 10 most used rules\n",
         "          -topdown    lists most used rules first\n",
         "          -unused     lists unused rules\n",
         "\n",
         "          Options -alpha and -topdown cancel the top 10\n",
         "          and unused lists.\n");
   exit;
}

#
# Check that we have a valid set of options,
# exiting if errors are found.
# 
if ($alpha && $topdown)
{
   # The message is written to STDOUT, but the script stops with a
   # non-zero exit status so the caller can detect that no report
   # was produced.
   printf("Error: an invalid option combination was selected\n");
   printf("       -alpha and -topdown are mutually exclusive.\n");
   exit 1;
}

#
# Extract a list of locally defined rules from the Spamassassin
# local configuration file.
#
# Three-argument open with a lexical handle: the path can never be taken
# for an open mode, and the handle is private to this section. The
# system error text is included so the reason for a failure is visible.
open(my $conf, '<', $localrules)
   or die "Error: can't open $localrules: $!";
while ($cl = <$conf>)
{
   # A rule is declared by a line of the form "describe NAME text...".
   # Leading whitespace is allowed so that indented declarations are
   # found as well. Capturing the name in the match also skips a
   # "describe" line that has no rule name, which would otherwise
   # produce an undefined hash key.
   if ($cl =~ /^\s*describe\s+(\S+)/i)
   {
      $name = $1;

      # Every known rule starts with zero hits so that rules which
      # never fire can still be listed.
      $hits{$name} = 0;
   }
}
close($conf);

#
# Scan the maillog(s)
# ===================
# One pass over STDIN counts ham and spam messages and counts the hits
# for each locally defined rule, using the spamd result lines.
#

#
# parse_result_line(LINE)
# Split one spamd result line into whitespace separated fields and
# locate the lone "-" field. The field two places before it is the
# spam flag ("Y" marks spam) and the field right after it is the comma
# separated list of rules that fired.
# Returns a list: the spam flag followed by the rule names. Returns an
# empty list when the line has no "-" field or has fewer than two
# fields in front of it, so the caller can skip the line.
#
sub parse_result_line
{
   my ($text) = @_;
   my @fld = split(/\s/, $text);

   # Find the separator field.
   my $i = 0;
   $i++ while ($i < @fld && $fld[$i] ne "-");

   # No separator, or no room for the flag in front of it: this is not
   # a usable result line.
   return () if $i >= @fld || $i < 2;

   # The rule list is missing or empty when no rules fired.
   my $tests = defined($fld[$i + 1]) ? $fld[$i + 1] : "";
   return ($fld[$i - 2], split(/,/, $tests));
}

$ham = 0;
$spam = 0;
while ($line = <STDIN>)
{
   next unless $line =~ /spamd.*result/;

   #
   # Collect spam data from the spamd result reporting line,
   # ignoring lines that cannot be parsed.
   #
   ($spamflag, @rules) = parse_result_line($line);
   next unless defined($spamflag);

   if ($spamflag eq "Y")
   {
      $spam++;
   }
   else
   {
      $ham++;
   }

   # Only rules known from the local configuration are counted.
   foreach $name (@rules)
   {
      $hits{$name}++ if exists($hits{$name});
   }
}

#
# All data has now been collected: output the required data
# =========================================================
# The ham/spam summary is always printed. Both counts come from the
# spamd result lines read from STDIN.
#
$tot = $spam + $ham;
if ($tot > 0)
{
   # Adding 0.5 makes the %d conversion below round to the nearest
   # whole percent instead of truncating.
   $perc = (100.0 * $ham ) / (1.0 * $tot) + 0.5;
}
else
{
   # No messages seen: avoid dividing by zero.
   $perc = 0;
}

print("\n");
printf("Accepted messages: %4d (%d%%)\n", $ham, $perc);
printf("Spam             : %4d\n", $spam);
printf("Total            : %4d\n", $tot);
print("\n");

#
# Rules statistics are selected by setting command line options:
# -alpha lists every rule by name, -topdown lists every rule by hit
# count, otherwise the top 10 summary and/or the unused rules are shown.
#

#
# by_hits_desc
# Sort comparator: most hits first. Rules with equal hit counts are
# ordered by name so that the listing, and the choice of rules that
# make the top 10, is the same on every run.
#
sub by_hits_desc
{
   return $hits{$b} <=> $hits{$a} || $a cmp $b;
}

if ($alpha)
{
   #
   # Report rule activity sorted by rule name
   #
   $fh = "%-18s  %5s\n";
   $fl = "%-18s  %5d\n";
   printf($fh, "Rule name", "Hits");
   printf($fh, "=========", "====");
   $hitcount = 0;
   foreach $rule (sort keys %hits)
   {
      printf($fl, $rule, $hits{$rule});
      $hitcount++;
   }

   printf("\n%d rules listed\n", $hitcount);
}
elsif ($topdown)
{
   #
   # Report rule activity, most used rule first
   #
   $fh = "%5s  %-18s\n";
   $fl = "%5d  %-18s\n";
   printf($fh, "Hits", "Rule name");
   printf($fh, "====", "=========");
   $hitcount = 0;
   foreach $rule (sort by_hits_desc keys %hits)
   {
      printf($fl, $hits{$rule}, $rule);
      $hitcount++;
   }

   printf("\n%d rules listed\n", $hitcount);
}
elsif ($summary || $unused)
{
   #
   # Report top 10 rules and unused rules
   #
   $fh = "%-18s  %5s\n";
   $ft = "%-18s  %5d\n";
   $fu = "%-18s\n";
   $topcount = 0;
   $unusedcount = 0;
   foreach $rule (sort by_hits_desc keys %hits)
   {
      #
      # This loop reports the top 10 rules (these are the first 10
      # with at least one hit) and builds a list of unused rules.
      #
      $n = $hits{$rule};
      if ($summary && $topcount < 10 && $n > 0)
      {
         # The heading is only printed once there is something to list.
         if ($topcount == 0)
         {
            printf($fh, "Top scoring rules", "Hits");
            printf($fh, "=================", "====");
         }

         printf($ft, $rule, $n);
         $topcount++;
      }

      if ($unused && $n < 1)
      {
         push(@unused, $rule);
      }
   }

   if ($unused && @unused)
   {
      #
      # If unused rules are to be reported, they are sorted
      # alphabetically and then output. A blank line separates
      # them from the top 10 list when that list was printed.
      #
      print("\n") if $topcount;
      printf($fu, "Unused rules");
      printf($fu, "============");
      foreach $rule (sort @unused)
      {
         printf($fu, $rule);
      }

      $unusedcount = scalar(@unused);
      $r = ($unusedcount == 1 ? "rule" : "rules");
      printf("\n%d %s did not fire\n", $unusedcount, $r);
   }
}
