From: minima <minima>
Date: Thu, 4 Oct 2001 13:53:47 +0000 (+0000)
Subject: 7. improved the regex matching of badwords (more efficient, better coverage)
X-Git-Tag: R_1_49~67
X-Git-Url: http://dxcluster.net/gitweb/gitweb.cgi?a=commitdiff_plain;h=888290f339e2ee00894445fecb14f0b506d12368;p=spider.git

7. improved the regex matching of badwords (more efficient, better coverage)
8. added default badword and badw_regex tables (as .issue files) which will
activate unless there is one there already.
---

diff --git a/Changes b/Changes
index 0890d5ed..57218b5a 100644
--- a/Changes
+++ b/Changes
@@ -6,6 +6,9 @@ last port of call).
 4. store deleted status across restarts!
 5. make callsign checking more rigorous
 6. dup check PC49 (kill full)
+7. improved the regex matching of badwords (more efficient, better coverage)
+8. added default badword and badw_regex tables (as .issue files) which will
+activate unless there is one there already.
 03Oct01=======================================================================
 1. don't allow @WWW to become a 'TO' field...
 2. handle @gb7tlh.#35.eu type addresses as well
diff --git a/perl/BadWords.pm b/perl/BadWords.pm
index 9814e3fa..36db8ffb 100644
--- a/perl/BadWords.pm
+++ b/perl/BadWords.pm
@@ -13,11 +13,20 @@ use strict;
 use DXUtil;
 use DXVars;
 use DXHash;
+use DXDebug;
+
 use IO::File;
 
-use vars qw($badword);
+use vars qw($badword @regex);
 
 my $oldfn = "$main::data/badwords";
+my $regex = "$main::data/badw_regex";
+my $bwfn = "$main::data/badword";
+
+# copy issue ones across
+filecopy("$regex.issue", $regex) unless -e $regex;
+filecopy("$bwfn.issue", $bwfn) unless -e $bwfn;
+
 $badword = new DXHash "badword";
 
 use vars qw($VERSION $BRANCH);
@@ -30,7 +39,6 @@ $main::branch += $BRANCH;
 sub load
 {
 	my @out;
-	return unless -e $oldfn;
 	my $fh = new IO::File $oldfn;
 	
 	if ($fh) {
@@ -45,11 +53,41 @@ sub load
 		$fh->close;
 		$badword->put;
 		unlink $oldfn;
+	}
+	push @out, create_regex(); 
+	return @out;
+}
+
+sub create_regex
+{
+	my @out;
+	@regex = ();
+	
+	my $fh = new IO::File $regex;
+	
+	if ($fh) {
+		while (<$fh>) {
+			chomp;
+			next if /^\s*\#/;
+			my @list = split " ";
+			for (@list) {
+				# create a closure for each word so that it matches stuff with spaces/punctuation
+				# and repeated characters in it
+				my $w = uc $_;
+				my @l = map { $_ eq 'I' ? '[I1]' : ($_ eq 'O' ? '[O0]' : $_) }split //, $w;
+				my $e = join '+[\s\W]+', @l;
+				my $s = eval qq{sub { return \$_[0] =~ /$e+/ ? '$w' : () } };
+				push @regex, $s unless $@;
+				dbg("create_regex: $@") if $@;
+			}
+		}
+		$fh->close;
 	} else {
-		my $l = "can't open $oldfn $!";
+		my $l = "can't open $regex $!";
 		dbg($l);
 		push @out, $l;
 	}
+	
 	return @out;
 }
 
@@ -57,32 +95,22 @@ sub load
 sub check
 {
 	my $s = uc shift;
+	my @out;
+	
+	for (@regex) {
+		push @out, &$_($s);
+	}
+	
+	return @out if @out;
 	
 	for (split(/\s+/, $s)) {
 		s/[^\w]//g;
-		return $_ if $badword->in($_);
+		push @out, $_ if $badword->in($_);
 		s/\'?S$//;
-		return $_ if $badword->in($_);
-	}
-	
-	# look for a few of the common ones with spaces and stuff
-	if ($s =~ /F[\s\W]*U[\s\W]*C[\s\W]*K/) {
-		return "FUCK";
-	} elsif ($s =~ /C[\s\W]*U[\s\W]*N[\s\W]*T/) {
-		return "CUNT";
-	} elsif ($s =~ /W[\s\W]*A[\s\W]*N[\s\W]*K/) {
-		return "WANK";
-	} elsif ($s =~ /C[\s\W]*[0O][\s\W]*C[\s\W]*K/) {
-		return "COCK";
-	} elsif ($s =~ /S[\s\W]*H[\s\W]*[I1][\s\W]*T/) {
-		return "SHIT";
-	} elsif ($s =~ /P[\s\W]*[I1][\s\W]*S[\s\W]*S/) {
-		return "PISS";
-	} elsif ($s =~ /B[\s\W]*[O0][\s\W]*L[\s\W]*L[\s\W]*[O0][\s\W]*[CK]/) {
-		return "BOLLOCKS";
+		push @out, $_ if $badword->in($_);
 	}
-	
-	return ();
+
+	return @out;
 }
 
 1;
diff --git a/perl/DXUtil.pm b/perl/DXUtil.pm
index 018a404a..42467548 100644
--- a/perl/DXUtil.pm
+++ b/perl/DXUtil.pm
@@ -10,6 +10,7 @@ package DXUtil;
 
 use Date::Parse;
 use IO::File;
+use File::Copy;
 use Data::Dumper;
 
 use strict;
@@ -26,6 +27,7 @@ require Exporter;
 @ISA = qw(Exporter);
 @EXPORT = qw(atime ztime cldate cldatetime slat slong yesno promptf 
 			 parray parraypairs phex shellregex readfilestr writefilestr
+			 filecopy
              print_all_fields cltounix unpad is_callsign is_latlong
 			 is_qra is_freq is_digits is_pctext is_pcflag insertitem deleteitem
             );
@@ -321,6 +323,11 @@ sub writefilestr
 	}
 }
 
+sub filecopy
+{
+	copy(@_) or return $!;
+}
+
 # remove leading and trailing spaces from an input string
 sub unpad
 {