added a query routine
[spider.git] / perl / ARRL / DX.pm
1 #
2 # (optional) ARRL Dx Database handling
3 #
4 # $Id$
5 #
6 # Copyright (c) 2005 Dirk Koopman G1TLH
7 #
8
9 use strict;
10
11 package ARRL::DX;
12
13 use vars qw($VERSION $BRANCH $dbh $dbname %tabledefs $error %stop $limit);
14
15 main::mkver($VERSION = q$Revision$) if main->can('mkver');
16
17 use DXLog;
18 use DXDebug;
19 use DXUtil;
20 use DBI;
21 use IO::File;
22 use Date::Parse;
23
24 $dbname = "$main::root/data/arrldx.db";
25 %tabledefs = (
26                           paragraph => 'CREATE TABLE paragraph(p text, t int, bullid text)',
27                           paragraph_t_idx => 'CREATE INDEX paragraph_t_idx ON paragraph(t DESC)',
28                           refer => 'CREATE TABLE refer(r text, rowid int, t int, pos int)',
29                           refer_id_idx => 'CREATE INDEX refer_id_idx ON refer(rowid)',
30                           refer_t_idx => 'CREATE INDEX refer_t_idx ON refer(t DESC)',
31                          );
32
33 %stop = (
34                  A => 1,
35                  ACTIVITY => 1,
36                  AND => 1,
37                  ARE => 1,
38                  AS => 1,
39                  AT => 1,
40                  BE => 1,
41                  BUT => 1,
42                  FOR => 1,
43                  FROM => 1,
44                  HAS => 1,
45                  HAVE => 1,
46                  HE => 1,
47                  I => 1,
48                  IF => 1,
49                  IN => 1,
50                  IS => 1,
51                  IT => 1,
52                  LOOK => 1,
53                  LOOKS => 1,
54                  NOT => 1,
55                  OF => 1,
56                  ON => 1,
57                  OR => 1,
58                  OUT => 1,
59                  SHE => 1,
60                  SO => 1,
61                  THAT => 1,
62                  THE => 1,
63                  THEM => 1,
64                  THEY => 1,
65                  THIS => 1,
66                  THIS => 1,
67                  TO => 1,
68                  WAS => 1,
69                  WHERE => 1,
70                  WILL => 1,
71                  WITH => 1,
72                  YOU => 1,
73
74                  JANUARY => 1,
75                  FEBRUARY => 1,
76                  MARCH => 1,
77                  APRIL => 1,
78                  MAY => 1,
79                  JUNE => 1,
80                  JULY => 1,
81                  AUGUST => 1,
82                  SEPTEMBER => 1,
83                  OCTOBER => 1,
84                  NOVEMBER => 1,
85                  DECEMBER => 1,
86                 );
87
88 $limit = 10;
89
90 sub do_connect
91 {
92         unless ($dbh) {
93                 $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "");
94                 unless ($dbh) {
95                         dbg($DBI::errstr);
96                         Log('err', $DBI::errstr);
97                         $error = $DBI::errstr;
98                         return;
99                 }
100         }
101         return $dbh;
102 }
103
104 sub drop
105 {
106         return unless do_connect();
107         my $sth = $dbh->prepare("select name,type from sqlite_master where type = 'table'") or $error = $DBI::errstr, return;
108         $sth->execute or $error = $DBI::errstr, return;
109         while (my @row = $sth->fetchrow_array) {
110                 $dbh->do("drop table $row[0]");
111         }
112         $sth->finish;
113 }
114
115 sub create
116 {
117         return unless $dbh;
118         
119         # check that all the tables are present and correct
120         my $sth = $dbh->prepare("select name,type from sqlite_master") or $error = $DBI::errstr, return;
121         $sth->execute or $error = $DBI::errstr, return;
122         my %f;
123         while (my @row = $sth->fetchrow_array) {
124                 $f{$row[0]} = $row[1];
125         }
126         foreach my $t (sort keys %tabledefs) {
127                 $dbh->do($tabledefs{$t}) unless exists $f{$t};
128         }
129         $sth->finish;
130 }
131
132 sub new
133 {
134         my $pkg = shift;
135         my $class = ref $pkg || $pkg;
136         my %args = @_;
137         
138         $error = undef;
139         
140         unless ($dbh) {
141                 return unless do_connect();
142                 create();
143         }
144
145         my $self = {};
146         
147         if ($args{file}) {
148                 if (ref $args{file}) {
149                         $self->{f} = $args{file};
150                 } else {
151                         $self->{f} = IO::File->new($args{file}) or $error = $!, return;
152                 }
153         } 
154         
155         return bless $self, $class; 
156 }
157
158 sub process
159 {
160         my $self = shift;
161
162         return unless $self->{f};
163         
164         my $state;
165         my $count;
166         
167         $dbh->begin_work;
168         my $f = $self->{f};
169         while (<$f>) {
170 #               print;
171                 unless ($state) {
172                         $state = 'ZC' if /^ZCZC/; 
173                 } elsif ($state eq 'ZC') {
174                         if (/\b(ARLD\d+)\b/) {
175                                 $self->{id} = $1;
176                                 $state = 'id';
177                         }
178                 } elsif ($state eq 'id') {
179                         if (/^Newington\s+CT\s+(\w+)\s+(\d+),\s+(\d+)/i) {
180                                 $state = 'date' ;
181                                 $self->{year} = $3;
182                                 $self->{date} = str2time("$1 $2 $3") if $state eq 'date';
183                         }
184                 } elsif ($state eq 'date') {
185                         if (/^$self->{id}/) {
186                                 last unless /DX\s+[Nn]ews\s*$/;
187                                 $state = 'week'; 
188                         }
189                 } elsif ($state eq 'week') {
190                         $state = 'weekro' if /^This\s+week/;
191                 } elsif ($state eq 'weekro') {
192                         if (/^\s*$/) {
193                                 $state = 'para';
194                                 $self->{para} = "";
195                         }
196                 } elsif ($state eq 'para') {
197                         if (/^\s*$/) {
198                                 if ($self->{para}) {
199                                         $self->{para} =~ s/^\s+//;
200                                         $self->{para} =~ s/\s+$//;
201                                         $self->{para} =~ s/\s+/ /g;
202                                         $self->insert;
203                                         $self->{para} = "";
204                                         $count++;
205                                 }
206                         } elsif (/^THIS\s+WEEKEND/) {
207                                 last;
208                         }
209                         chomp;
210                         s/^\s+//;
211                         s/\s+$//;
212                         $self->{para} .= $_ . ' ';
213                 }
214         }
215         $dbh->commit;
216         $self->{f}->close;
217         delete $self->{f};
218         return $count;
219 }
220
221 sub insert
222 {
223         my $self = shift;
224         my $sth = $dbh->prepare("insert into paragraph values(?,?,?)");
225         $sth->execute($self->{para}, $self->{date}, "$self->{year}-$self->{id}");
226         my $lastrow = $dbh->func('last_insert_rowid');
227         $sth->finish;
228         
229
230 #       my @w = split /[.,;:\s"'\$\%!£^&\*\(\)\[\]\{\}\#\<\>+=]+/, $self->{para};
231         my @w = split m|[\b\s]+|, $self->{para};
232 #       print join(' ', @w), "\n";
233         $sth = $dbh->prepare("insert into refer values(?,?,?,?)");
234         
235         my $i = 0;
236         for (@w) {
237                 
238                 # starts with a capital letter that isn't Q
239                 if (/^[A-PR-Z]/ || m|\d+[A-Z][-/A-Z0-9]*$|) {
240                         # not all digits
241                         next if /^\d+$/;
242                         
243                         # isn't a stop word
244                         my $w = uc;
245                         $w =~ s/\W+$//;
246                         unless ($stop{$w}) {
247                                 # add it into the word list
248                                 $sth->execute($w, $lastrow, $self->{date}, $i);
249 #                               print " $w";
250                         }
251                 }
252                 $i++;
253         }
254         $sth->finish;
255 }
256
257 sub query
258 {
259         my $self = shift;
260         my %args = @_;
261         my @out;
262         
263         if ($args{'q'}) {
264         my @w = map { s|[^-/\w]||g; uc $_ } split /\s+/, $args{'q'};
265                 if (@w) {
266                         my $s = qq{select distinct p, t, bullid from (select distinct rowid from refer where };
267                         while (@w) {
268                                 my $w = shift @w;
269                                 $s .= qq{r like '$w\%'};
270                                 $s .= ' or ' if @w;
271                         }
272                         my $l = $args{l}; 
273                         $l =~ s/[^\d]//g;
274                         $l ||= $limit;
275                         $s .= qq{ order by t desc limit $l), paragraph where paragraph.ROWID=rowid};
276                         my $sth = $dbh->prepare($s) or $error = $DBI::errstr, return @out;
277                         $sth->execute;
278                         while (my @row = $sth->fetchrow_array) {
279                                 push @out, \@row;
280                         }
281                         $sth->finish;
282                 }
283         }
284         return @out;
285 }
286
287 sub close
288 {
289         $dbh->disconnect;
290         undef $dbh;
291 }
292 1;