From: minima Date: Thu, 4 Oct 2001 13:53:47 +0000 (+0000) Subject: 7. improved the regex matching of badwords (more efficient, better coverage) X-Git-Tag: R_1_49~67 X-Git-Url: http://dxcluster.net/gitweb/gitweb.cgi?a=commitdiff_plain;h=888290f339e2ee00894445fecb14f0b506d12368;p=spider.git 7. improved the regex matching of badwords (more efficient, better coverage) 8. added default badword and badw_regex tables (as .issue files) which will activate unless there is one there already. --- diff --git a/Changes b/Changes index 0890d5ed..57218b5a 100644 --- a/Changes +++ b/Changes @@ -6,6 +6,9 @@ last port of call). 4. store deleted status across restarts! 5. make callsign checking more rigorous 6. dup check PC49 (kill full) +7. improved the regex matching of badwords (more efficient, better coverage) +8. added default badword and badw_regex tables (as .issue files) which will +activate unless there is one there already. 03Oct01======================================================================= 1. don't allow @WWW to become a 'TO' field... 2. handle @gb7tlh.#35.eu type addresses as well diff --git a/perl/BadWords.pm b/perl/BadWords.pm index 9814e3fa..36db8ffb 100644 --- a/perl/BadWords.pm +++ b/perl/BadWords.pm @@ -13,11 +13,20 @@ use strict; use DXUtil; use DXVars; use DXHash; +use DXDebug; + use IO::File; -use vars qw($badword); +use vars qw($badword @regex); my $oldfn = "$main::data/badwords"; +my $regex = "$main::data/badw_regex"; +my $bwfn = "$main::data/badword"; + +# copy issue ones across +filecopy("$regex.issue", $regex) unless -e $regex; +filecopy("$bwfn.issue", $bwfn) unless -e $bwfn; + $badword = new DXHash "badword"; use vars qw($VERSION $BRANCH); @@ -30,7 +39,6 @@ $main::branch += $BRANCH; sub load { my @out; - return unless -e $oldfn; my $fh = new IO::File $oldfn; if ($fh) { @@ -45,11 +53,41 @@ sub load $fh->close; $badword->put; unlink $oldfn; + } + push @out, create_regex(); + return @out; +} + +sub create_regex +{ + my @out; + @regex = (); + + my $fh = new IO::File $regex; + + if ($fh) { + while (<$fh>) { + chomp; + next if /^\s*\#/; + my @list = split " "; + for (@list) { + # create a closure for each word so that it matches stuff with spaces/punctuation + # and repeated characters in it + my $w = uc $_; + my @l = map { $_ eq 'I' ? '[I1]' : ($_ eq 'O' ? '[O0]' : $_) }split //, $w; + my $e = join '+[\s\W]+', @l; + my $s = eval qq{sub { return \$_[0] =~ /$e+/ ? '$w' : () } }; + push @regex, $s unless $@; + dbg("create_regex: $@") if $@; + } + } + $fh->close; } else { - my $l = "can't open $oldfn $!"; + my $l = "can't open $regex $!"; dbg($l); push @out, $l; } + return @out; } @@ -57,32 +95,22 @@ sub load sub check { my $s = uc shift; + my @out; + + for (@regex) { + push @out, &$_($s); + } + + return @out if @out; for (split(/\s+/, $s)) { s/[^\w]//g; - return $_ if $badword->in($_); + push @out, $_ if $badword->in($_); s/\'?S$//; - return $_ if $badword->in($_); - } - - # look for a few of the common ones with spaces and stuff - if ($s =~ /F[\s\W]*U[\s\W]*C[\s\W]*K/) { - return "FUCK"; - } elsif ($s =~ /C[\s\W]*U[\s\W]*N[\s\W]*T/) { - return "CUNT"; - } elsif ($s =~ /W[\s\W]*A[\s\W]*N[\s\W]*K/) { - return "WANK"; - } elsif ($s =~ /C[\s\W]*[0O][\s\W]*C[\s\W]*K/) { - return "COCK"; - } elsif ($s =~ /S[\s\W]*H[\s\W]*[I1][\s\W]*T/) { - return "SHIT"; - } elsif ($s =~ /P[\s\W]*[I1][\s\W]*S[\s\W]*S/) { - return "PISS"; - } elsif ($s =~ /B[\s\W]*[O0][\s\W]*L[\s\W]*L[\s\W]*[O0][\s\W]*[CK]/) { - return "BOLLOCKS"; + push @out, $_ if $badword->in($_); } - - return (); + + return @out; } 1; diff --git a/perl/DXUtil.pm b/perl/DXUtil.pm index 018a404a..42467548 100644 --- a/perl/DXUtil.pm +++ b/perl/DXUtil.pm @@ -10,6 +10,7 @@ package DXUtil; use Date::Parse; use IO::File; +use File::Copy; use Data::Dumper; use strict; @@ -26,6 +27,7 @@ require Exporter; @ISA = qw(Exporter); @EXPORT = qw(atime ztime cldate cldatetime slat slong yesno promptf parray parraypairs phex shellregex readfilestr writefilestr + filecopy print_all_fields cltounix unpad is_callsign is_latlong is_qra is_freq is_digits is_pctext is_pcflag insertitem deleteitem ); @@ -321,6 +323,11 @@ sub writefilestr } } +sub filecopy +{ + copy(@_) or return $!; +} + # remove leading and trailing spaces from an input string sub unpad {