#!/usr/bin/perl -wT
#
# $Id: unseen,v 1.10 2003/08/01 02:10:57 jmates Exp $
#
# The author disclaims all copyrights and releases this script into the
# public domain.
#
# Lists new (unseen) files found under the specified directories.

# three-argument open and lexical filehandles (see filedigest) need 5.6
require 5.006;
use strict;

sub filedigest;
sub duration2seconds;
sub remark;
sub print_help;
sub untaintpath;

use Digest::SHA1 ();
my $ctx = Digest::SHA1->new;

use File::Basename;
use File::Find;
use Time::Local;
use YAML ();

use Getopt::Std;
my %opts;
getopts('h?s:w:T', \%opts);

print_help() and exit 1
  if exists $opts{'h'}
  or exists $opts{'?'}
  or not @ARGV;

# sanitize search directories specified
my @dirs = untaintpath(@ARGV);
remark('alert', "untaint on search directories failed") and exit 1
  unless @dirs;
remark('warning',
    "removed "
      . (scalar(@ARGV) - scalar(@dirs))
      . " tainted search directory from argument list")
  if @dirs != @ARGV;

# find date ranges for new files to skip (-T "today") or files older
# than allowed by the -w "window" option
my $midnight = Time::Local::timelocal(qw(0 0 0), (localtime)[3 .. 5]);
my $window = ($midnight || $^T) - duration2seconds($opts{'w'} || '2w');
undef $midnight unless exists $opts{'T'};

# load and prepare seen index
my $seen_file;
if (exists $opts{'s'}) {
    $seen_file = untaintpath($opts{'s'})
      or remark('alert', "could not untaint seen filename") and exit 1;
}

# always a hash reference, even when no index exists yet, so that the
# lookups in process() and the final dump never see undef
my $seen = {};
if ($seen_file and -f $seen_file) {

    # NOTE(review): depending on the YAML.pm version and settings,
    # LoadFile may instantiate blessed objects named in the file; the
    # seen index should therefore live somewhere only trusted users can
    # write to.
    my $loaded = YAML::LoadFile($seen_file);
    if (ref $loaded eq 'HASH') {
        $seen = $loaded;
    }
    else {
        remark('warning', "ignoring malformed seen index: $seen_file");
    }

    # throw out older (or unusable) entries from seen data
    for my $key (keys %$seen) {
        my $entry = $seen->{$key};
        next
          if ref $entry eq 'HASH'
          and defined $entry->{mtime}
          and $entry->{mtime} >= $window;
        delete $seen->{$key};
    }
}

# look for files
find({ wanted => \&process, no_chdir => 1 }, @dirs);

# update seen index
YAML::DumpFile($seen_file, $seen) if $seen_file;

exit;

# File::Find file handler. Called with the current path in $_ (no_chdir
# is set, so this is the full path as seen from the search directory).
# Prints the path of each regular file inside the processing window
# whose content digest is not yet in the seen index, and records it.
sub process {
    return unless -f;

    my %filedata;

    # skip logfiles outside of processing window by modification time
    $filedata{mtime} = (stat _)[9];
    return if $filedata{mtime} < $window;

    # skip files modified today if needed
    if ($midnight) {
        return if $filedata{mtime} > $midnight;
    }

    # untaint filename (paranoia); untaintpath leaves $_ alone, so the
    # offending name is still available for the error message
    $filedata{filename} = untaintpath($_)
      or remark('alert', "could not untaint logfile name: $_") and exit 1;

    # skip files with no digest, or a digest already seen
    $filedata{digest} = filedigest($filedata{filename});
    return unless defined $filedata{digest};
    return if exists $seen->{ $filedata{digest} };

    # various path information for future processing
    ($filedata{name}, $filedata{parent}) = fileparse($filedata{filename});

    # just print full filename
    print $filedata{filename}, "\n";

    # update seen record with selected file data
    $seen->{ $filedata{digest} } = {
        filename => $filedata{filename},
        mtime    => $filedata{mtime},
    };
}

# untaints data, restricts what filenames may contain. Accepts 1 or more
# file paths, returns undef or 1 or more paths depending on the results
# of the tests: in list context every path that passed, in scalar
# context the first such path (undef if none passed).
#
# The arguments are only read, never assigned to: the elements of @_
# alias the caller's variables, so writing to them would overwrite
# @ARGV, $opts{s} or File::Find's $_ behind the caller's back.
sub untaintpath {
    my @paths;

    for my $candidate (@_) {
        next unless defined $candidate;

        # characters allowed in filenames, paths; the whole string must
        # match, and the capture is what carries the untainted value
        next unless $candidate =~ m,\A([A-Za-z0-9_./-]+)\z,;
        my $clean = $1;

        # KLUGE other attacks make use of ../../../ runs, though such might
        # be legitimate (though discouraged) means of locating files
        next if $clean =~ m,\Q../..,;

        push @paths, $clean;
    }

    return wantarray ? @paths : $paths[0];
}

# takes filename, returns sha1 digest (base64) of file contents using
# global object. Logs an alert and exits 1 if the file cannot be opened.
sub filedigest {
    my $file = shift;

    # three-argument open: the name is never parsed for a mode or pipe
    open my $fh, '<', $file
      or remark('alert', "problem opening $file: $!") and exit 1;
    binmode $fh;

    # b64digest method resets object, so can reuse
    my $digest = $ctx->addfile($fh)->b64digest;
    close $fh;

    return $digest;
}

# for logging things. With two or more arguments the first is the
# priority and the rest form the message; with a single argument the
# priority is 'info'. Writes "priority: message" to STDERR and always
# returns 1 so that callers can chain "remark(...) and exit 1".
sub remark {
    my $priority = @_ > 1 ? shift(@_) : 'info';
    my $message  = "@_";

    chomp $message;
    warn $priority, ": ", $message, "\n";

    return 1;
}

# takes duration such as "2m3s" and returns number of seconds.
sub duration2seconds {

    # Argument: a duration, either a plain non-negative integer (taken as
    # raw seconds) or short-hand built from number/unit pairs such as
    # "2w", "2m5s" or "1d 12h" (units: w d h m s).
    # Returns: the duration as a whole number of seconds.
    # On input that cannot be parsed, logs an alert through remark() and
    # exits with status 1.
    my $tmpdur = shift;
    $tmpdur = '' unless defined $tmpdur;
    my $seconds;

    # how to convert short human durations into seconds
    my %factor = (
        w => 604800,
        d => 86400,
        h => 3600,
        m => 60,
        s => 1,
    );

    # assume raw seconds for plain number
    if ($tmpdur =~ m/^\d+$/) {
        $seconds = $tmpdur + 0;
    }
    elsif ($tmpdur =~ m/^[wdhms\d\s]+$/) {

        # match "2m 5s" style input and convert to seconds
        while ($tmpdur =~ m/(\d+)\s*([wdhms])/g) {
            $seconds += $1 * $factor{$2};
        }
    }
    else {
        remark('alert', 'unknown characters in duration');
        exit 1;
    }

    # nothing parsed (e.g. only whitespace or units without numbers)
    unless (defined $seconds and $seconds =~ m/^\d+$/) {
        remark('alert', 'unabled to parse duration');
        exit 1;
    }

    return $seconds;
}

# clean up env for taint mode ("perldoc perlsec" for more information)
sub BEGIN {
    delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};
    $ENV{'PATH'} = '/bin:/usr/bin';
}

# a generic help blarb. Prints usage to standard output and returns 1,
# so that "print_help and exit 1" exits even if the print itself fails.
sub print_help {
    print <<"HELP";
Usage: $0 [options] searchdir1 [sd2 .. sdN]

Lists new (unseen) files under specified directory(-ies).

Options supported:

  -h/-?  Display this message.

  -s ss  Path to list of previously seen filenames index.
  -w dd  Window inside which files will be considered against index.

  -T     Skip files modified today.

Run perldoc(1) on this script for additional documentation.

HELP
    return 1;
}

__END__

######################################################################
#
# DOCUMENTATION

=head1 NAME

unseen - lists unseen files under specified directories

=head1 SYNOPSIS

List unseen files under /var/log/archive new within the last three
weeks, using /var/log/archive/seen as seen archive file.

  $ unseen -w 3w -s /var/log/archive/seen /var/log/archive

=head1 DESCRIPTION

=head2 Overview

For listing unseen files under directory trees. Written to handle case
of finding unprocessed logfiles (limited numbers of files) under log
directories for passing to swatch and other utilities.

Files are recorded as seen with aid of a seen file, which is indexed by
the SHA1 digest of the file contents.
A processing window option configures how far back in time the script is
allowed to report on, to keep the index file and SHA1 digest generation
needs from growing over time.

As a consequence of the SHA1 digest, the script will list both files
that are new to the search directory and ones that have been modified
since the previous run of the script. Without the seen index file, the
script will always list all files under the specified search
directories.

=head2 Normal Usage

  $ unseen [options] searchdir1 [sd2 .. sdN]

See L<"OPTIONS"> for details on the command line switches supported.

Either a single or multiple search directories can be specified;
directories will be searched recursively for regular files.

=head1 OPTIONS

This script currently supports the following command line switches:

=over 4

=item B<-h>, B<-?>

Prints a brief usage note about the script.

=item B<-s> I<seenfile>

Path to load seen file data from, or save to after listing the unseen
files for the run in question.

=item B<-w> I<duration>

Prevents files older than midnight minus I<duration> from being
considered with SHA1 checksum comparisons against the seen file index.
The duration can either be in raw seconds or a short-hand "2m5s" format.
The default is to skip files older than two weeks.

Should the script fail to calculate midnight, the script launch time
will be used.

=item B<-T>

Skip files modified after midnight (00:00) of current day. Good for
excluding active logfiles, assuming daily rotation.

=back

=head1 SECURITY

Taint mode is enabled by default. The script will die if the
I<seenfile> or supplied directories fail an untaint check.

=head1 BUGS

=head2 Reporting Bugs

Newer versions of this script may be available from:

http://sial.org/code/perl/

If the bug is in the latest version, send a report to the author.
Patches that fix problems or add new features are welcome.

=head2 Known Issues

No known bugs.

=head1 SEE ALSO

perl(1)

=head1 AUTHOR

Jeremy Mates, http://sial.org/contact/

=head1 COPYRIGHT

The author disclaims all copyrights and releases this script into the
public domain.

=head1 VERSION

  $Id: unseen,v 1.10 2003/08/01 02:10:57 jmates Exp $

=head1 SCRIPT CATEGORIES

Utilities
UNIX/System_administration

=cut