#!/usr/bin/perl -w

# This script will reduce the size of awstats data files by removing 
# IP addresses, URLs, etc. that only appear once or twice.
# By Han-Kwang Nienhuys, http://www.lagom.nl/, 2007
# I donate this to the public domain.

# Pruning rules, one per awstats section that should be thinned out.
#   key:   section name (the part after BEGIN_ in the data file)
#   value: "fieldno,minvalue" -- fieldno is the 0-based index of the
#          whitespace-separated field to test; records whose field is
#          below minvalue are removed.
our %thresholds = (
  SEARCHWORDS => "1,1.5",
  KEYWORDS    => "1,1.5",
  VISITOR     => "1,3.5",
  SIDER       => "1,1.5",
);

# Only touch files at least this many days old (overridable with -older=num).
our $mindays = 28;

# Debugging aid: when true, the original file is kept as "<name>.bak".
our $keep_backup = 0;

# Reference file size in bytes for threshold scaling (fuzzy logic).
# Files of this size or smaller use the minvalues above as they are
# (e.g. VISITOR 3.5 keeps only IP addresses with >=4 hits); for larger
# files the thresholds are increased proportionally to the file size.
our $threshold_scaling = 300000;


# Command-line handling.
#   -older=num  sets $mindays and is removed from @ARGV
#   -h...       (or an empty file list) prints the usage text and exits
# After this block @ARGV holds only the names of the files to process.
#
# The -older option is consumed first so that "-older=5" with no file names
# shows the usage text instead of silently doing nothing.
if (@ARGV && $ARGV[0] =~ /^-older/) {
  # Anchored at both ends: the whole argument must be "-older=<digits>".
  if ($ARGV[0] =~ /^-older=([0-9]+)$/) {
    $mindays = $1;
    shift(@ARGV);
  } else {
    print(STDERR "$ARGV[0]: syntax error\n");
    exit(1);
  }
}
if (!@ARGV || $ARGV[0] =~ /^-h/) {
  print("Use: $0 [option] awstats*.txt\n".
    "  for compacting awstats data files by removing hosts and search keywords with few hits\n".
    "  if '# squish-awstat' appears in the file, the file is ignored.\n".
    "  -older=num : only compress files that are older than num days (default: 28).\n");
  exit;
}


# Copy an awstats data stream from $in to $out, pruning low-count records.
#
#   $in    - readable handle positioned at the start of the data file
#   $out   - writable handle that receives the squished copy
#   $fsize - size of the original file in bytes, used to scale thresholds
#
# Uses the file-level settings %thresholds and $threshold_scaling.
# Behaviour per line:
#   * The MAP section (BEGIN_MAP .. END_MAP inclusive) is dropped.
#   * For sections listed in %thresholds, a "# squish-awstat: threshold N"
#     comment is written before the BEGIN_ line, records whose tested field
#     is below the threshold are dropped, and a "discarded X of Y lines"
#     comment is written after the END_ line.
#   * Everything else is copied through unchanged.
sub _squish_stream {
  my ($in, $out, $fsize) = @_;

  my $section;                   # current section name, undef between sections
  my $pruning = 0;               # true while inside a section from %thresholds
  my ($fno, $minval) = (0, 0);   # field index / minimum value for that section
  my ($nlines, $nskip) = (0, 0); # records seen / dropped in that section

  while (my $line = <$in>) {
    if ($line =~ /^BEGIN_([A-Z]+) /) {
      $section = $1;
      $pruning = defined($thresholds{$section});
      if ($pruning) {
        ($fno, $minval) = split(/,/, $thresholds{$section});
        # Scale the threshold with the file size, but never let the integer
        # truncation push it below the configured base value (e.g. 1.5 -> 1
        # would keep every record of a file just above the reference size).
        if ($fsize > $threshold_scaling) {
          my $scaled = int($minval * $fsize / $threshold_scaling);
          $minval = $scaled if $scaled > $minval;
        }
        print {$out} "# squish-awstat: threshold $minval\n";
      }
      ($nlines, $nskip) = (0, 0);
      print {$out} $line unless $section eq "MAP";
      next;
    }

    if ($line =~ /^END_[A-Z]+$/) {
      my $was_map = defined($section) && $section eq "MAP";
      # Always leave the section, including MAP; otherwise every line up to
      # the next BEGIN_ would be treated as part of MAP and thrown away.
      $section = undef;
      next if $was_map;
      print {$out} $line;
      if ($pruning) {
        $pruning = 0;
        print {$out} "# squish-awstat: discarded $nskip of $nlines lines\n";
      }
      next;
    }

    # Body of the MAP section: dropped.
    next if defined($section) && $section eq "MAP";

    if (!$pruning) {
      print {$out} $line;
      next;
    }

    # Record inside a pruned section: test the configured field.
    ++$nlines;
    my $value = (split(" ", $line))[$fno];
    # Lines without a plain numeric value in that field cannot be judged,
    # so they are kept rather than silently deleted.
    if (!defined($value) || $value !~ /^[0-9]+(?:\.[0-9]+)?$/
        || $value >= $minval) {
      print {$out} $line;
    } else {
      ++$nskip;
    }
  }
  return;
}

# Main loop: squish every file named on the command line that is old enough
# (per $mindays) and does not yet contain a "# squish-awstat" marker.
foreach my $fn (@ARGV) {
  # Same message the failing open() would give, without the undef warning.
  my @fstats = stat($fn) or die("$fn: $!\n");

  # file eligible?
  if ($fstats[9] > time() - 86400*$mindays) {
    print("$fn: skipped, too recent.\n");
    next;
  }

  # Three-argument open: the file name is never interpreted as a mode or pipe.
  open(my $in, '<', $fn) or die("$fn: $!\n");
  my $fresh = 1;
  while (my $line = <$in>) {
    if ($line =~ /^\# squish-awstat/) {
      $fresh = 0;
      last;
    }
  }
  if (!$fresh) {
    print("$fn: skipped, already squished.\n");
    close($in);
    next;
  }
  seek($in, 0, 0) or die("$fn: $!\n"); # rewind to beginning of file
  print("$fn: squishing...\n");

  my $outname;
  if ($keep_backup) {
    # The input handle stays attached to the renamed file.
    rename($fn, "$fn.bak") or die("moving $fn -> $fn.bak: $!\n");
    $outname = $fn;
    open($out, '>', $outname) or die("Writing $fn: $!\n");
  } else {
    $outname = "$fn.tmp";
    open($out, '>', $outname) or die("$fn.tmp: $!\n");
  }

  _squish_stream($in, $out, $fstats[7]);

  close($in);
  # Buffered write errors surface here; do not replace the original file
  # with an incomplete copy.
  close($out) or die("closing $outname: $!\n");
  if (!$keep_backup) {
    # rename() replaces the target in one step, so there is no window in
    # which the data file is missing (unlike unlink followed by rename).
    rename("$fn.tmp", $fn) or die("moving $fn.tmp -> $fn: $!\n");
  }
}