From: Dirk Koopman Date: Wed, 23 Nov 2022 13:47:18 +0000 (+0000) Subject: The new BadWord, all regex, system X-Git-Url: http://dxcluster.net/gitweb/gitweb.cgi?a=commitdiff_plain;h=606b9690c7bb9e293cb4a03507f0280eea05d9ce;p=spider.git The new BadWord, all regex, system --- diff --git a/Changes b/Changes index 0c941419..0cf34afc 100644 --- a/Changes +++ b/Changes @@ -1,3 +1,13 @@ +23Nov22======================================================================= +1. The BadWord system has been rewritten. This change is pretty radical and + needs to be used with care as a word that is entered will be reduced to the + minimum sized string needed to match that word. + + This is effective a sysop command changeable version of the file badw_regex + but in a much more sysop friendly form. The (un)set/badword ... + commands now update the /spider/local_data/badword.new file in real time + without having to mess about with editing files and running load/badword. + load/badword still works, but you should now never need it. 19Nov22======================================================================= 1. "Fix" Badword detection in spots and announces etc. NOTE: setting $DXCommandmode::maxbadcount to 0 (default 3) will disable diff --git a/cmd/Aliases b/cmd/Aliases index 246e8ab7..2461854b 100644 --- a/cmd/Aliases +++ b/cmd/Aliases @@ -31,6 +31,7 @@ package CmdAlias; '^ann?o?u?n?c?e?/full', 'announce full', 'announce', '^ann?o?u?n?c?e?/sysop', 'announce sysop', 'announce', '^ann?o?u?n?c?e?/(.*)$', 'announce $1', 'announce', + '^add/badwo?r?d?$', 'set/badword $1', 'set/badword', ], 'b' => [ '^b$', 'bye', 'bye', @@ -40,6 +41,7 @@ package CmdAlias; '^cre?a?t?e?$', 'apropos create', 'apropos', ], 'd' => [ + '^dele?t?e?/badwo?r?d?$', 'unset/badword $1', 'unset/badword', '^dele?t?e?/fu', 'kill full', 'kill', '^dele?t?e?$', 'kill', 'kill', '^dir?e?c?t?o?r?y?/a\w*', 'directory all', 'directory', diff --git a/cmd/Commands_en.hlp b/cmd/Commands_en.hlp index ac67b14f..7f5f3dbb 100644 --- a/cmd/Commands_en.hlp +++ b/cmd/Commands_en.hlp @@ -1641,22 +1641,39 @@ Use with extreme care. This command may well be superceded by FILTERing. This command will also stop TALK and ANNOUNCE/FULL from any user marked as a BADSPOTTER. -=== 6^SET/BADWORD ..^Stop things with this word being propagated -=== 6^UNSET/BADWORD ..^Propagate things with this word again +=== 6^SET/BADWORD ..^Stop things like this word being propagated Setting a word as a 'badword' will prevent things like spots, announces or talks with this word in the the text part from going any further. They will not be displayed and they will not be sent onto other nodes. -The word must be written in full, no wild cards are allowed eg:- +This has changed its meaning from the master release. All words entered +are reduced to the minimum regex that will match words starting like +this one: + + set/badword annihilate + +will stop anything that starts with these words in the text +like this: + + annihilate annihilated + +but it will also stop things like this: - set/badword annihilate annihilated annihilation + anihilate annni11ihhh ii lllattt eee ddd -will stop anything with these words in the text. +A few common 'leet' substitutions are automatically matched: - unset/badword annihilated + b0ll0cks bo0lll0ccckks fr1iig -will allow text with this word again. +and so on + +=== 6^UNSET/BADWORD ..^Propagate things like this word again +This is the opposite of set/badword + + unset/badword fred + +will allow text with this word again (if it has been set as a bad word. === 0^SET/BEEP^Add a beep to DX and other messages on your terminal === 0^UNSET/BEEP^Stop beeps for DX and other messages on your terminal diff --git a/cmd/set/badword.pl b/cmd/set/badword.pl index 2879388f..751e4d44 100644 --- a/cmd/set/badword.pl +++ b/cmd/set/badword.pl @@ -9,6 +9,23 @@ my ($self, $line) = @_; return (1, $self->msg('e5')) if $self->remotecmd; # are we permitted? return (1, $self->msg('e5')) if $self->priv < 6; -$line = join(' ', map {s|[/-]\d+$||; $_} split(/\s+/, $line)); -return $BadWords::badword->set(8, $self->msg('e6'), $self, $line); - +my @words = split /\s+/, uc $line; +my @out; +my $count = 0; +foreach my $w (@words) { + my @in; + + if (@in = BadWords::check($w)) { + push @out, "BadWord $w already matched by '$in[0]', ignored"; + } else { + @in = BadWords::add_regex($w); + push @out, "BadWord $w added as '$in[0]'"; + $count++; + } +} +if ($count) { + BadWords::generate_regex(); + BadWords::put(); +} +return (1, @out); + diff --git a/cmd/show/badword.pl b/cmd/show/badword.pl index a8ef5c57..947f70a4 100644 --- a/cmd/show/badword.pl +++ b/cmd/show/badword.pl @@ -9,5 +9,30 @@ my ($self, $line) = @_; return (1, $self->msg('e5')) if $self->remotecmd; # are we permitted? return (1, $self->msg('e5')) if $self->priv < 6; -return $BadWords::badword->show(1, $self); +my @out; +my @l; +my $count = 0; + +if ($line =~ /^\s*full/i) { + foreach my $w (BadWords::list_regex(1)) { + ++$count; + push @out, $w; + } +} +else { + foreach my $w (BadWords::list_regex()) { + ++$count; + if (@l >= 5) { + push @out, sprintf "%-12s %-12s %-12s %-12s %-12s", @l; + @l = (); + } + push @l, $w; + } + push @l, "" while @l < 5; + push @out, sprintf "%-12s %-12s %-12s %-12s %-12s", @l; +} + +push @out, "$count BadWords"; + +return (1, @out); diff --git a/cmd/unset/badword.pl b/cmd/unset/badword.pl index d2b7fe31..6d49de3a 100644 --- a/cmd/unset/badword.pl +++ b/cmd/unset/badword.pl @@ -9,6 +9,24 @@ my ($self, $line) = @_; return (1, $self->msg('e5')) if $self->remotecmd; # are we permitted? return (1, $self->msg('e5')) if $self->priv < 6; -$line = join(' ', map {s|[/-]\d+$||; $_} split(/\s+/, $line)); -return $BadWords::badword->unset(8, $self->msg('e6'), $self, $line); + +my @words = split /\s+/, uc $line; +my @out; +my $count = 0; +foreach my $w (@words) { + my @in; + + unless (@in = BadWords::check($w)) { + push @out, "BadWord $w not defined, ignored"; + } else { + @in = BadWords::del_regex($w); + push @out, "BadWord $w removed"; + $count++; + } +} +if ($count) { + BadWords::generate_regex(); + BadWords::put(); +} +return (1, @out); diff --git a/data/badword.new.issue b/data/badword.new.issue new file mode 100644 index 00000000..a0d897d8 --- /dev/null +++ b/data/badword.new.issue @@ -0,0 +1,128 @@ +ACBAR +AGBAR +AKELA +ALAH +ALHA +ANIHILATE +ANIHILATION +ANUS +ARSE +ATACK +AVENGER +BARENDERO +BARSTARD +BASTARD +BASURA +BINLADAN +BINLADEN +BLOD +BOLCK +BOLOCK +BOMB +BUGER +BUST +CABRON +CHRISTIAN +COCA +COCK +COJONES +CORNUDO +CORNUPETA +CORUPTO +CRAP +CUEZETA +CUNT +DAMN +DEADH +DEATH +DESGRACIADO +DETH +DICKHEAD +DUMBAS +DXFUN +ENFERMITO +ENFERMO +ENVIDIOSO +ESTUPIDO +EXPLOSIVE +FOLA +FUCK +FUK +FUNKER +HACKED +HIJOPUTA +HIJOS +HITLER +IDIOT +IMBECIL +JERK +JIHAD +JODAN +JODE +JODIENDO +JOETE +KIL +KLOT +LADEN +LADIN +LADRON +MAFIA +MAMON +MARICA +MARICONAZO +MASMURDER +MEGATON +MENTAL +MENTIROSO +MIERDA +MORO +MOTHERFUCKER +MULSIM +MURDER +MUSLIM +NUKE +OSAMA +PEDERASTA +PIS +PLONKER +POLA +POLITIC +POYA +PUDENDA +PUDENDUM +PUSY +PUTA +PUTIN +PUTO +RADIOBASURA +RAGHEAD +RATA +RELIGION +RKOAL +RKOKILER +RKOPUTIN +ROFRE +ROPUTIN +SHIT +SLAG +SLAUGHTER +SLAVE +SOD +SPOTWAR +STOPUTIN +STOPWAR +STUPID +SUBNORMAL +TERORIST +TIT +TONTO +TOSER +TOSPOT +TRUCK +TRUK +TWAT +URE +VENGADOR +WANK +WASOCK +WHORE diff --git a/perl/BadWords.pm b/perl/BadWords.pm index 312a0408..09911b4a 100644 --- a/perl/BadWords.pm +++ b/perl/BadWords.pm @@ -12,85 +12,244 @@ use strict; use DXUtil; use DXVars; -use DXHash; use DXDebug; use IO::File; -use vars qw($badword $regexcode); +our $regex; # the big bad regex generated from @relist +our @relist; # the list of regexes to try, record = [canonical word, regex] +my %in; # the collection of words we are building up and their regexes -our $regex; -# load the badwords file +# load the badwords file(s) sub load { - my $bwfn = localdata("badword"); - filecopy("$main::data.issue", $bwfn) unless -e $bwfn; - - my @out; + %in = (); + @relist = (); + $regex = ''; - $badword = new DXHash "badword"; + my @inw; + my @out; + my $wasold; - push @out, create_regex(); - return @out; -} -sub create_regex -{ - $regex = localdata("badw_regex"); - filecopy("$regex.gb.issue", $regex) unless -e $regex; + my $newfn = localdata("badword.new"); + filecopy("$main::data/badword.new.issue", $newfn) unless -e $newfn; + if (-e $newfn) { + # new style + dbg("BadWords: Found new style badword.new file"); + my $fh = new IO::File $newfn; + if ($fh) { + while (<$fh>) { + chomp; + next if /^\s*\#/; + add_regex(uc $_); + } + $fh->close; + @relist = sort {$a->[0] cmp $b->[0]} @relist; # just in case... + dbg("BadWords: " . scalar @relist . " new style badwords read"); + } + else { + my $l = "BadWords: can't open $newfn $!"; + dbg($l); + push @out, $l; + return @out; + } + } + else { + + # using old style files + my $bwfn = localdata("badword"); + filecopy("$main::data/badword.issue", $bwfn) unless -e $bwfn; - my @out; - my $fh = new IO::File $regex; + # parse the existing static file + dbg("BadWords: Using old style badword file"); - if ($fh) { - my $s = "sub { my \$str = shift; my \@out; \n"; - while (<$fh>) { - chomp; - next if /^\s*\#/; - my @list = split " "; - for (@list) { - # create a closure for each word so that it matches stuff with spaces/punctuation - # and repeated characters in it - my $w = uc $_; - my @l = split //, $w; - my $e = join '+[\s\W]*', @l; - $s .= qq{push \@out, \$1 if \$str =~ m|\\b($e+)|;\n}; + my $fh = new IO::File $bwfn; + if ($fh) { + my $line = 0; + while (<$fh>) { + chomp; + ++$line; + next if /^\s*\#/; + unless (/\w+\s+=>\s+\d+,/) { + dbg("BadWords: syntax error in $bwfn:$line '$_'"); + next; + } + my @line = split /\s+/, uc $_; + shift @line unless $line[0]; + push @inw, $line[0]; } + $fh->close; } - $s .= "return \@out;\n}"; - $regexcode = eval $s; - dbg($s) if isdbg('badword'); - if ($@) { - @out = ($@); - dbg($@); + else { + my $l = "BadWords: can't open $bwfn $!"; + dbg($l); + push @out, $l; return @out; } - $fh->close; - } else { - my $l = "can't open $regex $!"; - dbg($l); - push @out, $l; + + # do the same for badw_regex + my $regexfn = localdata("badw_regex"); + filecopy("$main::data/badw_regex.gb.issue", $regexfn) unless -e $regexfn; + dbg("BadWords: Using old style badw_regex file"); + $fh = new IO::File $regexfn; + + if ($fh) { + while (<$fh>) { + chomp; + next if /^\s*\#/; + next if /^\s*$/; + push @inw, split /\s+/, uc $_; + } + $fh->close; + } + else { + my $l = "BadWords: can't open $regexfn $!"; + dbg($l); + push @out, $l; + return @out; + } + + ++$wasold; } + + # catch most of the potential duplicates + @inw = sort @inw; + for (@inw) { + add_regex($_); + } + + # create the master regex + generate_regex(); + # use new style from now on + put() if $wasold; + + return @out; } +sub generate_regex +{ + my $res; + @relist = sort {$a->[0] cmp $b->[0]} @relist; + for (@relist) { + $res .= qq{(?:$_->[1]) |\n}; + } + $res =~ s/\s*\|\s*$//; + $regex = qr/\b($res)/x; +} + + +sub _cleanword +{ + my $w = uc shift; + $w =~ tr/01/OI/; # de-leet any incoming words + my $last = ''; # remove duplicate letters (eg BOLLOCKS > BOLOCKS) + my @w; + for (split //, $w) { + next if $last eq $_; + $last = $_; + push @w, $_; + } + return @w ? join('', @w) : ''; +} + +sub add_regex +{ + my @list = split /\s+/, shift; + my @out; + + for (@list) { + my $w = uc $_; + $w = _cleanword($w); + + next unless $w && $w =~ /^\w+$/; # has to be a word + next if $in{$w}; # ignore any we have already dealt with + next if _slowcheck($w); # check whether this will already be detected + + # re-leet word (in regex speak)if required + my @l = map { s/O/[O0]/g; s/I/[I1]/g; $_ } split //, $w; + my $e = join '+[\s\W]*', @l; + my $q = $e; + push @relist, [$w, $q]; + $in{$w} = $q; + dbg("$w = $q") if isdbg('badword'); + push @out, $w; + } + return @out; +} + +sub del_regex +{ + my @list = split /\s+/, shift; + my @out; + + for (@list) { + my $w = uc $_; + $w = _cleanword($w); + next unless $in{$w}; + delete $in{$w}; + @relist = grep {$_->[0] ne $w} @relist; + push @out, $w + } + return @out; +} + +sub list_regex +{ + my $full = shift; + return map { $full ? "$_->[0] = $_->[1]" : $_->[0] } @relist; +} + # check the text against the badwords list sub check { my $s = uc shift; my @out; - - push @out, &$regexcode($s) if $regexcode; - - return @out if @out; - for (split(/\b/, $s)) { - push @out, $_ if $badword->in($_); + if ($regex) { + my %uniq; + @out = grep {++$uniq{$_}; $uniq{$_} == 1 ? $_ : undef }($s =~ /\b($regex)/g); + dbg("BadWords: check '$s' = '" . join(', ', @out) . "'") if isdbg('badword'); + return @out; } + return _slowcheck($s) if @relist; + return; +} + +sub _slowcheck +{ + my $w = shift; + my @out; + + for (@relist) { + push @out, $w =~ /\b($_->[1])/; + } return @out; } +# write out the new bad words list +sub put +{ + my @out; + my $newfn = localdata("badword.new"); + my $fh = new IO::File ">$newfn"; + if ($fh) { + dbg("BadWords: put new badword.new file"); + @relist = sort {$a->[0] cmp $b->[0]} @relist; + for (@relist) { + print $fh "$_->[0]\n"; + } + $fh->close; + } + else { + my $l = "BadWords: can't open $newfn $!"; + dbg($l); + push @out, $l; + return @out; + } +} 1; diff --git a/perl/Editable.pm b/perl/Editable.pm index 0655fcf6..d2f15eee 100644 --- a/perl/Editable.pm +++ b/perl/Editable.pm @@ -39,7 +39,7 @@ sub addline my $dxchan = shift; my $line = shift; - if (my @ans = BadWord::check($line)) { + if (my @ans = BadWords::check($line)) { return ($dxchan->msg('e17', @ans)); } push @{$self->{lines}}, $line; @@ -53,7 +53,7 @@ sub modline my $no = shift; my $line = shift; - if (my @ans = BadWord::check($line)) { + if (my @ans = BadWords::check($line)) { return ($dxchan->msg('e17', @ans)); } ${$self->{lines}}[$no] = $line; diff --git a/perl/cluster.pl b/perl/cluster.pl index 718a2f17..baa48ca7 100755 --- a/perl/cluster.pl +++ b/perl/cluster.pl @@ -671,7 +671,7 @@ sub setup_start UDPMsg::init(\&new_channel); # load bad words - dbg("load badwords: " . (BadWords::load() or "Ok")); + BadWords::load(); # prime some signals unless ($DB::VERSION) {