projects
/
spider.git
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
fix badword, ipv6 address detect, add CTY
[spider.git]
/
perl
/
BadWords.pm
diff --git
a/perl/BadWords.pm
b/perl/BadWords.pm
index e7d1169e3a390abd5845f5922bb299eaa79cefdc..312a04082c705fac550f9b0fcb065388cca2e564 100644
(file)
--- a/
perl/BadWords.pm
+++ b/
perl/BadWords.pm
@@
-3,7
+3,7
@@
#
# Copyright (c) 2000 Dirk Koopman
#
#
# Copyright (c) 2000 Dirk Koopman
#
-#
$Id$
+#
#
package BadWords;
#
package BadWords;
@@
-13,50
+13,84
@@
use strict;
use DXUtil;
use DXVars;
use DXHash;
use DXUtil;
use DXVars;
use DXHash;
-use
IO::File
;
+use
DXDebug
;
-use
vars qw($badword)
;
+use
IO::File
;
-my $oldfn = "$main::data/badwords";
-$badword = new DXHash "badword";
+use vars qw($badword $regexcode);
-use vars qw($VERSION $BRANCH);
-$VERSION = sprintf( "%d.%03d", q$Revision$ =~ /(\d+)\.(\d+)/ );
-$BRANCH = sprintf( "%d.%03d", q$Revision$ =~ /\d+\.\d+\.(\d+)\.(\d+)/ ) || 0;
-$main::build += $VERSION;
-$main::branch += $BRANCH;
+our $regex;
# load the badwords file
sub load
{
# load the badwords file
sub load
{
+ my $bwfn = localdata("badword");
+ filecopy("$main::data.issue", $bwfn) unless -e $bwfn;
+
my @out;
my @out;
- return unless -e $oldfn;
- my $fh = new IO::File $oldfn;
+
+ $badword = new DXHash "badword";
+
+ push @out, create_regex();
+ return @out;
+}
+
+sub create_regex
+{
+ $regex = localdata("badw_regex");
+ filecopy("$regex.gb.issue", $regex) unless -e $regex;
+
+ my @out;
+ my $fh = new IO::File $regex;
if ($fh) {
if ($fh) {
+ my $s = "sub { my \$str = shift; my \@out; \n";
while (<$fh>) {
chomp;
next if /^\s*\#/;
my @list = split " ";
for (@list) {
while (<$fh>) {
chomp;
next if /^\s*\#/;
my @list = split " ";
for (@list) {
- $badword->add($_);
+ # create a closure for each word so that it matches stuff with spaces/punctuation
+ # and repeated characters in it
+ my $w = uc $_;
+ my @l = split //, $w;
+ my $e = join '+[\s\W]*', @l;
+ $s .= qq{push \@out, \$1 if \$str =~ m|\\b($e+)|;\n};
}
}
}
}
+ $s .= "return \@out;\n}";
+ $regexcode = eval $s;
+ dbg($s) if isdbg('badword');
+ if ($@) {
+ @out = ($@);
+ dbg($@);
+ return @out;
+ }
$fh->close;
$fh->close;
- $badword->put;
- unlink $oldfn;
} else {
} else {
- my $l = "can't open $
oldfn
$!";
+ my $l = "can't open $
regex
$!";
dbg($l);
push @out, $l;
}
dbg($l);
push @out, $l;
}
+
return @out;
}
# check the text against the badwords list
sub check
{
return @out;
}
# check the text against the badwords list
sub check
{
- return grep { $badword->in($_) } split(/\b/, lc shift);
+ my $s = uc shift;
+ my @out;
+
+ push @out, &$regexcode($s) if $regexcode;
+
+ return @out if @out;
+
+ for (split(/\b/, $s)) {
+ push @out, $_ if $badword->in($_);
+ }
+
+ return @out;
}
1;
}
1;