X-Git-Url: http://dxcluster.net/gitweb/gitweb.cgi?a=blobdiff_plain;f=perl%2FBadWords.pm;h=141b3e9a729b6a933149cb0bf71bf4a942938285;hb=b06fd1d447d5411d8d441e807f93efa897b68aaf;hp=36db8ffba44384d76a3bc8acbc9789d0447cbbef;hpb=888290f339e2ee00894445fecb14f0b506d12368;p=spider.git diff --git a/perl/BadWords.pm b/perl/BadWords.pm index 36db8ffb..141b3e9a 100644 --- a/perl/BadWords.pm +++ b/perl/BadWords.pm @@ -3,7 +3,7 @@ # # Copyright (c) 2000 Dirk Koopman # -# $Id$ +# # package BadWords; @@ -17,24 +17,18 @@ use DXDebug; use IO::File; -use vars qw($badword @regex); +use vars qw($badword $regexcode); -my $oldfn = "$main::data/badwords"; -my $regex = "$main::data/badw_regex"; -my $bwfn = "$main::data/badword"; +my $oldfn = localdata("badwords"); +my $regex = localdata("badw_regex"); +my $bwfn = localdata("badword"); # copy issue ones across -filecopy("$regex.issue", $regex) unless -e $regex; +filecopy("$regex.gb.issue", $regex) unless -e $regex; filecopy("$bwfn.issue", $bwfn) unless -e $bwfn; $badword = new DXHash "badword"; -use vars qw($VERSION $BRANCH); -$VERSION = sprintf( "%d.%03d", q$Revision$ =~ /(\d+)\.(\d+)/ ); -$BRANCH = sprintf( "%d.%03d", q$Revision$ =~ /\d+\.\d+\.(\d+)\.(\d+)/ ) || 0; -$main::build += $VERSION; -$main::branch += $BRANCH; - # load the badwords file sub load { @@ -61,11 +55,10 @@ sub load sub create_regex { my @out; - @regex = (); - my $fh = new IO::File $regex; if ($fh) { + my $s = "sub { my \$str = shift; my \@out; \n"; while (<$fh>) { chomp; next if /^\s*\#/; @@ -74,13 +67,19 @@ sub create_regex # create a closure for each word so that it matches stuff with spaces/punctuation # and repeated characters in it my $w = uc $_; - my @l = map { $_ eq 'I' ? '[I1]' : ($_ eq 'O' ? '[O0]' : $_) }split //, $w; - my $e = join '+[\s\W]+', @l; - my $s = eval qq{sub { return \$_[0] =~ /$e+/ ? '$w' : () } }; - push @regex, $s unless $@; - dbg("create_regex: $@") if $@; + my @l = split //, $w; + my $e = join '+[\s\W]*', @l; + $s .= "push \@out, \$1 if \$str =~ /\\b($e)/;\n"; } } + $s .= "return \@out;\n}"; + $regexcode = eval $s; + dbg($s) if isdbg('badword'); + if ($@) { + @out = ($@); + dbg($@); + return @out; + } $fh->close; } else { my $l = "can't open $regex $!"; @@ -96,17 +95,12 @@ sub check { my $s = uc shift; my @out; - - for (@regex) { - push @out, &$_($s); - } + + push @out, &$regexcode($s) if $regexcode; return @out if @out; - for (split(/\s+/, $s)) { - s/[^\w]//g; - push @out, $_ if $badword->in($_); - s/\'?S$//; + for (split(/\b/, $s)) { push @out, $_ if $badword->in($_); }