X-Git-Url: http://dxcluster.net/gitweb/gitweb.cgi?a=blobdiff_plain;f=perl%2FBadWords.pm;h=312a04082c705fac550f9b0fcb065388cca2e564;hb=6ec22e78a4a344ce675645fabf18b2a1971f364a;hp=e7d1169e3a390abd5845f5922bb299eaa79cefdc;hpb=17f0b57add792391822d38116e89b33c1df4e2dd;p=spider.git diff --git a/perl/BadWords.pm b/perl/BadWords.pm index e7d1169e..312a0408 100644 --- a/perl/BadWords.pm +++ b/perl/BadWords.pm @@ -3,7 +3,7 @@ # # Copyright (c) 2000 Dirk Koopman # -# $Id$ +# # package BadWords; @@ -13,50 +13,84 @@ use strict; use DXUtil; use DXVars; use DXHash; -use IO::File; +use DXDebug; -use vars qw($badword); +use IO::File; -my $oldfn = "$main::data/badwords"; -$badword = new DXHash "badword"; +use vars qw($badword $regexcode); -use vars qw($VERSION $BRANCH); -$VERSION = sprintf( "%d.%03d", q$Revision$ =~ /(\d+)\.(\d+)/ ); -$BRANCH = sprintf( "%d.%03d", q$Revision$ =~ /\d+\.\d+\.(\d+)\.(\d+)/ ) || 0; -$main::build += $VERSION; -$main::branch += $BRANCH; +our $regex; # load the badwords file sub load { + my $bwfn = localdata("badword"); + filecopy("$main::data.issue", $bwfn) unless -e $bwfn; + my @out; - return unless -e $oldfn; - my $fh = new IO::File $oldfn; + + $badword = new DXHash "badword"; + + push @out, create_regex(); + return @out; +} + +sub create_regex +{ + $regex = localdata("badw_regex"); + filecopy("$regex.gb.issue", $regex) unless -e $regex; + + my @out; + my $fh = new IO::File $regex; if ($fh) { + my $s = "sub { my \$str = shift; my \@out; \n"; while (<$fh>) { chomp; next if /^\s*\#/; my @list = split " "; for (@list) { - $badword->add($_); + # create a closure for each word so that it matches stuff with spaces/punctuation + # and repeated characters in it + my $w = uc $_; + my @l = split //, $w; + my $e = join '+[\s\W]*', @l; + $s .= qq{push \@out, \$1 if \$str =~ m|\\b($e+)|;\n}; } } + $s .= "return \@out;\n}"; + $regexcode = eval $s; + dbg($s) if isdbg('badword'); + if ($@) { + @out = ($@); + dbg($@); + return @out; + } $fh->close; - $badword->put; - unlink $oldfn; } else { - my $l = "can't open $oldfn $!"; + my $l = "can't open $regex $!"; dbg($l); push @out, $l; } + return @out; } # check the text against the badwords list sub check { - return grep { $badword->in($_) } split(/\b/, lc shift); + my $s = uc shift; + my @out; + + push @out, &$regexcode($s) if $regexcode; + + return @out if @out; + + for (split(/\b/, $s)) { + push @out, $_ if $badword->in($_); + } + + return @out; } 1;