#!/usr/bin/perl -w

# This script will reduce the size of awstats data files by removing
# IP addresses, URLs, etc. that only appear once or twice.
# By Han-Kwang Nienhuys, http://www.lagom.nl/, 2007
# I donate this to the public domain.
#
# Usage: see the help text printed when run without arguments or with -h.
#
# For every eligible file the script copies the file line by line, and inside
# the sections listed in %thresholds it drops every record whose selected
# field is below the (size-scaled) minimum value.  The MAP section is removed
# entirely.  A "# squish-awstat" comment is written into the output so that an
# already processed file is recognised and skipped on later runs.

use strict;
use warnings;

# keys: section name (the NAME in a "BEGIN_NAME " line)
# value: "fieldno,minvalue" where fieldno is the 0-based index of the
#        whitespace-separated field to test and minvalue is the smallest
#        value a record must have in that field to be kept.
my %thresholds = (
    "SEARCHWORDS", "1,1.5",
    "KEYWORDS",    "1,1.5",
    "VISITOR",     "1,3.5",
    "SIDER",       "1,1.5"
);

# Threshold scaling with original file size in bytes.
# This is fuzzy logic. For an awstats file with filesize equal to or smaller
# than $threshold_scaling, use the minvalues above (e.g. VISITOR 3.5 means
# only IP addresses with >=4 hits).
# For larger files, the thresholds are increased proportionally (and then
# truncated to an integer).
my $threshold_scaling = 300000;

my $keep_backup = 0;    # for debugging: keep the original as "$fn.bak"
my $mindays     = 28;   # files modified more recently than this are skipped

if ($#ARGV == -1 || $ARGV[0] =~ /^-h/) {
    print("Use: $0 [option] awstats*.txt\n".
          " for compacting awstats data files by removing hosts and search keywords with few hits\n".
          " if '# squish-awstat' appears in the file, the file is ignored.\n".
          " -older=num : only compress files that are older than num days (default: 28).\n");
    exit;
}

if ($ARGV[0] =~ /^-older/) {
    if ($ARGV[0] =~ /-older=([0-9]+)$/) {
        $mindays = $1;
        shift(@ARGV);
    }
    else {
        print(STDERR "$ARGV[0]: syntax error\n");
        exit(1);
    }
}

FILE:
foreach my $fn (@ARGV) {
    # stat fields used below: [7] = size in bytes, [9] = modification time.
    my @fstats = stat($fn);
    if (!@fstats) {
        die("$fn: $!\n");
    }

    # file eligible?
    if ($fstats[9] > time() - 86400*$mindays) {
        print("$fn: skipped, too recent.\n");
        next FILE;
    }

    # Three-argument open: the file name comes from the command line and must
    # never be interpreted as an open mode or a pipe.
    open(my $fh, '<', $fn) or die("$fn: $!\n");

    # First pass: look for the marker left behind by a previous run.
    my $fresh = 1;
    while (my $line = <$fh>) {
        if ($line =~ /^\# squish-awstat/) {
            $fresh = 0;
            last;
        }
    }
    if (!$fresh) {
        print("$fn: skipped, already squished.\n");
        close($fh);
        next FILE;
    }

    # rewind to beginning of file for the second (copying) pass
    seek($fh, 0, 0) or die("$fn: $!\n");
    print("$fn: squishing...\n");

    my $ofh;
    if ($keep_backup) {
        # The already open input handle keeps reading the renamed file.
        rename($fn, "$fn.bak") or die("moving $fn -> $fn.bak: $!\n");
        open($ofh, '>', $fn) or die("Writing $fn: $!\n");
    }
    else {
        open($ofh, '>', "$fn.tmp") or die("$fn.tmp: $!\n");
    }

    # The file size is constant for the whole pass, so take it once from the
    # stat() done above instead of re-statting for every section.
    my $fsize = $fstats[7];

    my $current_section;            # name of the section being read, if any
    my $current_fno        = 0;     # index of the field compared to the threshold
    my $current_minval     = 0;     # scaled threshold for the current section
    my $current_parse_flag = 0;     # true while inside a section in %thresholds
    my $current_num        = 0;     # records seen in the current section
    my $current_nskip      = 0;     # records discarded in the current section

    LINE:
    while (my $line = <$fh>) {
        if ($line =~ /^BEGIN_([A-Z]+) /) {
            $current_section    = $1;
            $current_parse_flag = exists($thresholds{$current_section});
            if ($current_parse_flag) {
                ($current_fno, $current_minval) =
                    split(/,/, $thresholds{$current_section});
                # scale threshold with file size
                if ($fsize > $threshold_scaling) {
                    $current_minval =
                        int($current_minval*$fsize/$threshold_scaling);
                }
                print {$ofh} "# squish-awstat: threshold $current_minval\n";
            }
            $current_num   = 0;
            $current_nskip = 0;
            # The MAP section is not copied to the output.
            # NOTE(review): presumably because it holds file offsets that are
            # stale once records are removed -- confirm against AWStats docs.
            next LINE if $current_section eq "MAP";
            print {$ofh} $line;
            next LINE;
        }

        if ($line =~ /^END_[A-Z]+$/) {
            my $closing_map =
                defined($current_section) && $current_section eq "MAP";
            # Always leave the section here; otherwise everything following
            # END_MAP up to the next BEGIN_ line would be dropped as well.
            $current_section = undef;
            next LINE if $closing_map;

            print {$ofh} $line;
            if ($current_parse_flag) {
                $current_parse_flag = 0;
                print {$ofh} "# squish-awstat: discarded $current_nskip of $current_num lines\n";
            }
            next LINE;
        }

        # Lines inside the MAP section are dropped.
        next LINE if defined($current_section) && $current_section eq "MAP";

        if (!$current_parse_flag) {
            print {$ofh} $line;
            next LINE;
        }

        # ok, now parsing and checking...
        ++$current_num;
        my @f = split(" ", $line);
        # A record without the tested field cannot reach the threshold, so it
        # is discarded like any other low-count record.
        if (defined($f[$current_fno]) && $f[$current_fno] >= $current_minval) {
            print {$ofh} $line;
        }
        else {
            ++$current_nskip;
        }
    }

    close($fh);
    # Buffered write errors (e.g. disk full) only surface at close; never
    # replace the original with an incomplete copy.
    close($ofh) or die("Writing $fn: $!\n");

    if (!$keep_backup) {
        # rename() replaces the destination directly, so the original is never
        # deleted before its replacement is safely in place.
        rename("$fn.tmp", $fn) or die("moving $fn.tmp -> $fn: $!\n");
    }
}