2c2fc5907f7b1ce8be88cf279546588976a5a941
[spider.git] / perl / Buck.pm
1 #!/usr/bin/perl -w
2
3 package Buck;
4
5 use HTML::Parser;
6 use Data::Dumper;
7 use DXUtil;
8
9 @ISA = qw( HTML::Parser );
10
11 use vars qw($VERSION $BRANCH);
12 $VERSION = sprintf( "%d.%03d", q$Revision$ =~ /(\d+)\.(\d+)/ );
13 $BRANCH = sprintf( "%d.%03d", q$Revision$ =~ /\d+\.\d+\.(\d+)\.(\d+)/  || (0,0));
14 $main::build += $VERSION;
15 $main::branch += $BRANCH;
16
17 use strict;
18
19 sub new
20 {
21     my $pkg = shift;
22         my $self = SUPER::new $pkg;
23         $self->{list} = [];
24         $self->{state} = 'pre';
25     $self->{sort} = undef;
26         $self->{debug} = 0;
27     $self->{call} = uc shift;
28         return $self;
29 }
30
31 sub start
32 {
33         my ($self, $tag, $attr, $attrseq, $origtext) = @_;
34         if ($self->{debug}) {
35                 print "$self->{state} $tag";
36         if ($attr) {
37                         my $dd = new Data::Dumper([$attr], [qw(attr)]);
38                         $dd->Terse(1);
39                         $dd->Indent(0);
40                         $dd->Quotekeys(0);
41                         print " ", $dd->Dumpxs;
42                 }
43                 print "\n";
44         }
45         if ($self->{state} eq 'pre' && $tag eq 'table') {
46                 $self->state('t1');
47         } elsif ($self->{state} eq 't1' && $tag eq 'table') {
48                 $self->state('t2');
49         } elsif ($self->{state} eq 't2' && $tag =~ /^h/) {
50                 $self->{addr} = "";
51                 $self->{laddr} = 0;
52                 $self->state('addr');
53         } elsif ($self->{state} eq 'addr') {
54                 if ($tag eq 'br') {
55                         $self->{addr} .= ", " if length $self->{addr} > $self->{laddr};
56                         $self->{laddr} = length $self->{addr};
57                 } elsif ($tag eq 'p') {
58             push @{$self->{list}}, $self->{addr} ? "$self->{call}|addr|$self->{addr}" : "$self->{call}|addr|unknown";
59                         $self->state('semail');
60                 }
61         } elsif ($self->{state} eq 'email') {
62                 if ($tag eq 'a') {
63                         my $email = $attr->{href};
64                         if ($email && $email =~ /mailto/i) {
65                                 $email =~ s/mailto://i;
66                                 push @{$self->{list}}, "$self->{call}|email|$email";
67                         }
68                 } elsif ($tag eq 'br' || $tag eq 'p') {
69                         $self->state('post');
70                 }
71         } elsif ($self->{state} eq 'post' && $tag eq 'form') {
72                 if (exists $self->{pos} && length $self->{pos}) {
73                         push @{$self->{list}}, "$self->{call}|location|$self->{pos}";
74                         $self->state('last');
75                 }
76         }
77 }
78
79 sub text
80 {
81         my ($self, $text) = @_;
82         $text =~ s/^[\s\r\n]+//g;
83         $text =~ s/[\s\r\n]+$//g;
84     print "$self->{state} text $text\n" if $self->{debug};      
85         if (length $text) {
86                 if ($self->{state} eq 'addr') {
87                         $text =~ s/\ //gi;
88                         $self->{addr} .= $text;
89                 } elsif ($self->{state} eq 'semail' && $text =~ /Email/i ) {
90                         $self->state('email');
91                 } elsif ($self->{state} eq 'post') {
92                         if ($text =~ /Latitude/i) {
93                                 $self->state('lat');
94                                 $self->{pos} = "" unless $self->{pos};
95                         } elsif ($text =~ /Longitude/i) {
96                                 $self->state('long');
97                                 $self->{pos} = "" unless $self->{pos};
98                         } elsif ($text =~ /Grid/i) {
99                                 $self->state('grid');
100                                 $self->{pos} = "" unless $self->{pos};
101                         }
102                 } elsif ($self->{state} eq 'lat') {
103                         my ($n, $l) = $text =~ /(\b[\d\.]+\b)\s+([NSns])/;
104                         $n = -$n if $l eq 'S' || $l eq 's';
105                         $self->{pos} = slat($n);
106                         $self->state('post');
107                 } elsif ($self->{state} eq 'long') {
108                         my ($n, $l) = $text =~ /(\b[\d\.]+\b)\s+([EWew])/;
109                         $n = -$n if $l eq 'W' || $l eq 'w';
110                         $self->{pos} .= "|" . slong($n);
111                         $self->state('post');
112                 } elsif ($self->{state} eq 'grid') {
113                         my ($qra) = $text =~ /(\b\w\w\d\d\w\w\b)/;
114                         $self->{pos} .= "|" . uc $qra;
115                         push @{$self->{list}}, "$self->{call}|location|$self->{pos}";
116                         $self->state('last');
117                 } elsif (($self->{state} eq 'pre' || $self->{state} =~ /^t/) && $text =~ /not\s+found/) {
118             push @{$self->{list}}, "$self->{call}|addr|unknown";
119                         $self->state('last');
120                 } elsif ($self->{state} eq 'email' && $text =~ /unknown/i) {
121                         $self->state('post');
122                 }
123         }
124 }
125
126 sub state
127 {
128         my $self = shift;
129         $self->{state} = shift if @_;
130         return $self->{state};
131 }
132
133 sub end
134 {
135         my ($self, $tag, $origtext) = @_;
136     print "$self->{state} /$tag\n" if $self->{debug};
137 }
138
139 sub debug
140 {
141         my ($self, $val) = @_;
142         $self->{debug} = $val;
143 }
144
145 sub answer
146 {
147         my $self = shift;
148         return @{$self->{list}};
149 }
150
151 1;
152