X-Git-Url: http://dxcluster.net/gitweb/gitweb.cgi?a=blobdiff_plain;f=perl%2FBadWords.pm;h=446bcf1e4f06ca9996504bcbd3445971c27bde80;hb=6a5fdda14c1dccca4e002618cc8a428b1dc163bd;hp=81e7ec90eee67c57a93bb52cb08b4079dea100c3;hpb=7346d1f8129457eee2e602b36e27435141533049;p=spider.git diff --git a/perl/BadWords.pm b/perl/BadWords.pm index 81e7ec90..446bcf1e 100644 --- a/perl/BadWords.pm +++ b/perl/BadWords.pm @@ -3,7 +3,7 @@ # # Copyright (c) 2000 Dirk Koopman # -# $Id$ +# # package BadWords; @@ -12,73 +12,244 @@ use strict; use DXUtil; use DXVars; -use DXHash; -use IO::File; +use DXDebug; -use vars qw($badword); +use IO::File; -my $oldfn = "$main::data/badwords"; -$badword = new DXHash "badword"; +our $regex; # the big bad regex generated from @relist +our @relist; # the list of regexes to try, record = [canonical word, regex] +my %in; # the collection of words we are building up and their regexes -use vars qw($VERSION $BRANCH); -$VERSION = sprintf( "%d.%03d", q$Revision$ =~ /(\d+)\.(\d+)/ ); -$BRANCH = sprintf( "%d.%03d", q$Revision$ =~ /\d+\.\d+\.(\d+)\.(\d+)/ ) || 0; -$main::build += $VERSION; -$main::branch += $BRANCH; -# load the badwords file +# load the badwords file(s) sub load { + %in = (); + @relist = (); + $regex = ''; + + my @inw; my @out; - return unless -e $oldfn; - my $fh = new IO::File $oldfn; + my $wasold; - if ($fh) { - while (<$fh>) { - chomp; - next if /^\s*\#/; - my @list = split " "; - for (@list) { - $badword->add($_); + + my $newfn = localdata("badword.new"); + filecopy("$main::data/badword.new.issue", $newfn) unless -e $newfn; + if (-e $newfn) { + # new style + dbg("BadWords: Found new style badword.new file"); + my $fh = new IO::File $newfn; + if ($fh) { + while (<$fh>) { + chomp; + next if /^\s*\#/; + add_regex(uc $_); } + $fh->close; + @relist = sort {$a->[0] cmp $b->[0]} @relist; # just in case... + dbg("BadWords: " . scalar @relist . " new style badwords read"); } - $fh->close; - $badword->put; - unlink $oldfn; - } else { - my $l = "can't open $oldfn $!"; - dbg($l); - push @out, $l; + else { + my $l = "BadWords: can't open $newfn $!"; + dbg($l); + push @out, $l; + return @out; + } + } + else { + + # using old style files + my $bwfn = localdata("badword"); + filecopy("$main::data/badword.issue", $bwfn) unless -e $bwfn; + + # parse the existing static file + dbg("BadWords: Using old style badword file"); + + my $fh = new IO::File $bwfn; + if ($fh) { + my $line = 0; + while (<$fh>) { + chomp; + ++$line; + next if /^\s*\#/; + unless (/\w+\s+=>\s+\d+,/) { + dbg("BadWords: syntax error in $bwfn:$line '$_'"); + next; + } + my @line = split /\s+/, uc $_; + shift @line unless $line[0]; + push @inw, $line[0]; + } + $fh->close; + } + else { + my $l = "BadWords: can't open $bwfn $!"; + dbg($l); + push @out, $l; + return @out; + } + + # do the same for badw_regex + my $regexfn = localdata("badw_regex"); + filecopy("$main::data/badw_regex.gb.issue", $regexfn) unless -e $regexfn; + dbg("BadWords: Using old style badw_regex file"); + $fh = new IO::File $regexfn; + + if ($fh) { + while (<$fh>) { + chomp; + next if /^\s*\#/; + next if /^\s*$/; + push @inw, split /\s+/, uc $_; + } + $fh->close; + } + else { + my $l = "BadWords: can't open $regexfn $!"; + dbg($l); + push @out, $l; + return @out; + } + + ++$wasold; + } + + # catch most of the potential duplicates + @inw = sort @inw; + for (@inw) { + add_regex($_); } + + # create the master regex + generate_regex(); + + # use new style from now on + put() if $wasold; + + return @out; } +sub generate_regex +{ + my $res; + @relist = sort {$a->[0] cmp $b->[0]} @relist; + for (@relist) { + $res .= qq{\\b(?:$_->[1]) |\n}; + } + $res =~ s/\s*\|\s*$//; + $regex = qr/\b($res)/x; +} + + +sub _cleanword +{ + my $w = uc shift; + $w =~ tr/01/OI/; # de-leet any incoming words + my $last = ''; # remove duplicate letters (eg BOLLOCKS > BOLOCKS) + my @w; + for (split //, $w) { + next if $last eq $_; + $last = $_; + push @w, $_; + } + return @w ? join('', @w) : ''; +} + +sub add_regex +{ + my @list = split /\s+/, shift; + my @out; + + for (@list) { + my $w = uc $_; + $w = _cleanword($w); + + next unless $w && $w =~ /^\w+$/; # has to be a word + next if $in{$w}; # ignore any we have already dealt with + next if _slowcheck($w); # check whether this will already be detected + + # re-leet word (in regex speak)if required + my @l = map { s/O/[O0]/g; s/I/[I1]/g; $_ } split //, $w; + my $e = join '+[\s\W]*', @l; + my $q = $e; + push @relist, [$w, $q]; + $in{$w} = $q; + dbg("$w = $q") if isdbg('badword'); + push @out, $w; + } + return @out; +} + +sub del_regex +{ + my @list = split /\s+/, shift; + my @out; + + for (@list) { + my $w = uc $_; + $w = _cleanword($w); + next unless $in{$w}; + delete $in{$w}; + @relist = grep {$_->[0] ne $w} @relist; + push @out, $w + } + return @out; +} + +sub list_regex +{ + my $full = shift; + return map { $full ? "$_->[0] = $_->[1]" : $_->[0] } @relist; +} + # check the text against the badwords list sub check { my $s = uc shift; + my @out; - for (split(/\s+/, $s)) { - s/[^\w]//g; - return $_ if $badword->in($_); - s/\'?S$//; - return $_ if $badword->in($_); + if ($regex) { + my %uniq; + @out = grep {++$uniq{$_}; $uniq{$_} == 1 ? $_ : undef }($s =~ /($regex)/g); + dbg("BadWords: check '$s' = '" . join(', ', @out) . "'") if isdbg('badword'); + return @out; } + return _slowcheck($s) if @relist; + return; +} + + +sub _slowcheck +{ + my $w = shift; + my @out; - # look for a few of the common ones with spaces and stuff - if ($s =~ /F[\s\W]*U[\s\W]*C[\s\W]*K/) { - return "FUCK"; - } elsif ($s =~ /C[\s\W]*U[\s\W]*N[\s\W]*T/) { - return "CUNT"; - } elsif ($s =~ /W[\s\W]*A[\s\W]*N[\s\W]*K/) { - return "WANK"; - } elsif ($s =~ /C[\s\W]*[0O][\s\W]*C[\s\W]*K/) { - return "COCK"; - } elsif ($s =~ /S[\s\W]*H[\s\W]*[I1][\s\W]*T/) { - return "SHIT"; + for (@relist) { + push @out, $w =~ /\b($_->[1])/; } - - return undef; + return @out; } +# write out the new bad words list +sub put +{ + my @out; + my $newfn = localdata("badword.new"); + my $fh = new IO::File ">$newfn"; + if ($fh) { + dbg("BadWords: put new badword.new file"); + @relist = sort {$a->[0] cmp $b->[0]} @relist; + for (@relist) { + print $fh "$_->[0]\n"; + } + $fh->close; + } + else { + my $l = "BadWords: can't open $newfn $!"; + dbg($l); + push @out, $l; + return @out; + } +} 1;