Ticket #92: Bayes.pm

File Bayes.pm, 139.7 kB (added by amatubu, 3 years ago)

Replacement of Bayes.pm

Line 
1 # POPFILE LOADABLE MODULE
2 package Classifier::Bayes;
3
4 use POPFile::Module;
5 @ISA = ("POPFile::Module");
6
7 #----------------------------------------------------------------------------
8 #
9 # Bayes.pm --- Naive Bayes text classifier
10 #
11 # Copyright (c) 2001-2008 John Graham-Cumming
12 #
13 #   This file is part of POPFile
14 #
15 #   POPFile is free software; you can redistribute it and/or modify it
16 #   under the terms of version 2 of the GNU General Public License as
17 #   published by the Free Software Foundation.
18 #
19 #   POPFile is distributed in the hope that it will be useful,
20 #   but WITHOUT ANY WARRANTY; without even the implied warranty of
21 #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22 #   GNU General Public License for more details.
23 #
24 #   You should have received a copy of the GNU General Public License
25 #   along with POPFile; if not, write to the Free Software
26 #   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27 #
28 #   Modified by              Sam Schinke    (sschinke@users.sourceforge.net)
29 #   Merged with db code from Scott Leighton (helphand@users.sourceforge.net)
30 #
31 #----------------------------------------------------------------------------
32
33 use strict;
34 use warnings;
35 use locale;
36 use Classifier::MailParse;
37 use IO::Handle;
38 use DBI;
39 use Digest::MD5 qw( md5_hex );
40 use MIME::Base64;
41 use File::Copy;
42
43 # This is used to get the hostname of the current machine
44 # in a cross platform way
45
46 use Sys::Hostname;
47
48 # A handy variable containing the value of an EOL for networks
49
50 my $eol = "\015\012";
51
52 # Korean characters definition
53
54 my $ksc5601_sym = '(?:[\xA1-\xAC][\xA1-\xFE])';
55 my $ksc5601_han = '(?:[\xB0-\xC8][\xA1-\xFE])';
56 my $ksc5601_hanja  = '(?:[\xCA-\xFD][\xA1-\xFE])';
57 my $ksc5601 = "(?:$ksc5601_sym|$ksc5601_han|$ksc5601_hanja)";
58
59 my $eksc = "(?:$ksc5601|[\x81-\xC6][\x41-\xFE])"; #extended ksc
60
61 #----------------------------------------------------------------------------
62 # new
63 #
64 #   Class new() function
65 #----------------------------------------------------------------------------
66 sub new
67 {
68     my $type = shift;
69     my $self = POPFile::Module->new();
70
71     # Set this to 1 to get scores for individual words in message detail
72
73     $self->{wordscores__}        = 0;
74
75     # Choice for the format of the "word matrix" display.
76
77     $self->{wmformat__}          = '';
78
79     # Just our hostname
80
81     $self->{hostname__}        = '';
82
83     # File Handle for DBI database
84
85     $self->{db__}                = {};
86
87     $self->{history__}        = 0;
88
89     # To save time we also 'prepare' some commonly used SQL statements
90     # and cache them here, see the function db_connect__ for details
91
92     $self->{db_get_buckets__} = 0;
93     $self->{db_get_wordid__} = 0;
94     $self->{db_get_word_count__} = 0;
95     $self->{db_put_word_count__} = 0;
96     $self->{db_get_bucket_unique_counts__} = 0;
97     $self->{db_get_unique_word_count__} = 0;
98     $self->{db_get_bucket_word_counts__} = 0;
99     $self->{db_get_full_total__} = 0;
100     $self->{db_get_bucket_parameter__} = 0;
101     $self->{db_set_bucket_parameter__} = 0;
102     $self->{db_get_bucket_parameter_default__} = 0;
103     $self->{db_get_buckets_with_magnets__} = 0;
104     $self->{db_delete_zero_words__} = 0;
105
106     # Caches the name of each bucket and relates it to both the bucket
107     # ID in the database and whether it is pseudo or not
108     #
109     # Subkeys used are:
110     #
111     # id     The bucket ID in the database
112     # pseudo 1 if this is a pseudo bucket
113
114     $self->{db_bucketid__}       = {};
115
116     # Caches the IDs that map to parameter types
117
118     $self->{db_parameterid__}    = {};
119
120     # Caches looked up parameter values on a per bucket basis
121
122     $self->{db_parameters__}     = {};
123
124     # Used to parse mail messages
125     $self->{parser__}            = new Classifier::MailParse;
126
127     # The possible colors for buckets
128     $self->{possible_colors__} = [ 'red',       'green',      'blue',       'brown', # PROFILE BLOCK START
129                                    'orange',    'purple',     'magenta',    'gray',
130                                    'plum',      'silver',     'pink',       'lightgreen',
131                                    'lightblue', 'lightcyan',  'lightcoral', 'lightsalmon',
132                                    'lightgrey', 'darkorange', 'darkcyan',   'feldspar',
133                                    'black' ];                                        # PROFILE BLOCK STOP
134
135     # Precomputed per bucket probabilities
136     $self->{bucket_start__}      = {};
137
138     # A very unlikely word
139     $self->{not_likely__}        = {};
140
141     # The expected corpus version
142     #
143     # DEPRECATED  This is only used when upgrading old flat file corpus files
144     #             to the database
145     $self->{corpus_version__}    = 1;
146
147     # The unclassified cutoff this value means that the top
148     # probabilily must be n times greater than the second probability,
149     # default is 100 times more likely
150     $self->{unclassified__}      = log(100);
151
152     # Used to tell the caller whether a magnet was used in the last
153     # mail classification
154     $self->{magnet_used__}       = 0;
155     $self->{magnet_detail__}     = 0;
156
157     # This maps session keys (long strings) to user ids.  If there's
158     # an entry here then the session key is valid and can be used in
159     # the POPFile API.  See the methods get_session_key and
160     # release_session_key for details
161
162     $self->{api_sessions__}      = {};
163
164     # Used to indicate whether we are using SQLite and what the full
165     # path and name of the database is if we are.
166
167     $self->{db_is_sqlite__}      = 0;
168     $self->{db_name__}           = '';
169
170     # Must call bless before attempting to call any methods
171
172     bless $self, $type;
173
174     $self->name( 'bayes' );
175
176     return $self;
177 }
178
179 #----------------------------------------------------------------------------
180 #
181 # forked
182 #
183 # This is called inside a child process that has just forked, since
184 # the child needs access to the database we open it
185 #
186 #----------------------------------------------------------------------------
187 sub forked
188 {
189     my ( $self ) = @_;
190
191     $self->db_connect__();
192 }
193
194 #----------------------------------------------------------------------------
195 #
196 # childexit
197 #
198 # This is called inside a child process that is about to finish, since
199 # the child does not need access to the database we close it
200 #
201 #----------------------------------------------------------------------------
202 sub childexit
203 {
204     my ( $self ) = @_;
205
206     $self->db_disconnect__();
207 }
208
209 #----------------------------------------------------------------------------
210 #
211 # initialize
212 #
213 # Called to set up the Bayes module's parameters
214 #
215 #----------------------------------------------------------------------------
216 sub initialize
217 {
218     my ( $self ) = @_;
219
220     # This is the name for the database
221
222     $self->config_( 'database', 'popfile.db' );
223
224     # This is the 'connect' string used by DBI to connect to the
225     # database, if you decide to change from using SQLite to some
226     # other database (e.g. MySQL, Oracle, ... ) this *should* be all
227     # you need to change.  The additional parameters user and auth are
228     # needed for some databases.
229     #
230     # Note that the dbconnect string
231     # will be interpolated before being passed to DBI and the variable
232     # $dbname can be used within it and it resolves to the full path
233     # to the database named in the database parameter above.
234
235     $self->config_( 'dbconnect', 'dbi:SQLite:dbname=$dbname' );
236     $self->config_( 'dbuser', '' ); $self->config_( 'dbauth', '' );
237
238     # SQLite 1.05+ had some problems we've resolved.
239     # This parameter is no longer used but we leave it for future use
240
241     $self->config_( 'bad_sqlite_version', '4.0.0' );
242
243     # No default unclassified weight is the number of times more sure
244     # POPFile must be of the top class vs the second class, default is
245     # 100 times more
246
247     $self->config_( 'unclassified_weight', 100 );
248
249     # The corpus is kept in the 'corpus' subfolder of POPFile
250     #
251     # DEPRECATED This is only used to find an old corpus that might
252     # need to be upgraded
253
254     $self->config_( 'corpus', 'corpus' );
255
256     # The characters that appear before and after a subject
257     # modification
258
259     $self->config_( 'subject_mod_left',  '[' );
260     $self->config_( 'subject_mod_right', ']' );
261
262     # Get the hostname for use in the X-POPFile-Link header
263
264     $self->{hostname__} = hostname;
265
266     # Allow the user to override the hostname
267
268     $self->config_( 'hostname', $self->{hostname__} );
269
270     # If set to 1 then the X-POPFile-Link will have < > around the URL
271     # (i.e. X-POPFile-Link: <http://foo.bar>) when set to 0 there are
272     # none (i.e. X-POPFile-Link: http://foo.bar)
273
274     $self->config_( 'xpl_angle', 0 );
275
276     # This parameter is used when the UI is operating in Stealth Mode.
277     # If left blank (the default setting) the X-POPFile-Link will use 127.0.0.1
278     # otherwise it will use this string instead. The system's HOSTS file should
279     # map the string to 127.0.0.1
280
281     $self->config_( 'localhostname', '' );
282
283     # This is a bit mask used to control options when we are using the
284     # default SQLite database.  By default all the options are on.
285     #
286     # 1 = Asynchronous deletes
287     # 2 = Backup database every hour
288
289     $self->config_( 'sqlite_tweaks', 0xFFFFFFFF );
290
291     # Japanese wakachigaki parser ('kakasi' or 'mecab' or 'internal').
292     $self->config_( 'nihongo_parser', 'kakasi' );
293
294     $self->mq_register_( 'COMIT', $self );
295     $self->mq_register_( 'RELSE', $self );
296
297     # Register for the TICKD message which is sent hourly by the
298     # Logger module.  We use this to hourly save the database if bit 1
299     # of the sqlite_tweaks is set and we are using SQLite
300
301     $self->mq_register_( 'TICKD', $self );
302
303     return 1;
304 }
305
306 #----------------------------------------------------------------------------
307 #
308 # deliver
309 #
310 # Called by the message queue to deliver a message
311 #
312 # There is no return value from this method
313 #
314 #----------------------------------------------------------------------------
315 sub deliver
316 {
317     my ( $self, $type, @message ) = @_;
318
319     if ( $type eq 'COMIT' ) {
320         $self->classified( $message[0], $message[2] );
321     }
322
323     if ( $type eq 'RELSE' ) {
324         $self->release_session_key_private__( $message[0] );
325     }
326
327     if ( $type eq 'TICKD' ) {
328         $self->backup_database__();
329     }
330 }
331
332 #----------------------------------------------------------------------------
333 #
334 # start
335 #
336 # Called to start the Bayes module running
337 #
338 #----------------------------------------------------------------------------
339 sub start
340 {
341     my ( $self ) = @_;
342
343     # In Japanese or Korean or Chinese mode, explicitly set LC_COLLATE to C.
344     #
345     # This is to avoid Perl crash on Windows because default
346     # LC_COLLATE of Japanese Win is Japanese_Japan.932(Shift_JIS),
347     # which is different from the charset POPFile uses for Japanese
348     # characters(EUC-JP).
349
350     my $lang = $self->module_config_( 'html', 'language' ) || '';
351
352     if ( $lang =~ /^(Nihongo$|Korean$|Chinese)/ ) {
353         use POSIX qw( locale_h );
354         setlocale( LC_COLLATE, 'C' );
355     }
356
357     # Pass in the current interface language for language specific parsing
358
359     $self->{parser__}->{lang__}  = $lang;
360     $self->{unclassified__} = log( $self->config_( 'unclassified_weight' ) );
361
362     if ( !$self->db_connect__() ) {
363         return 0;
364     }
365
366     if ( $lang eq 'Nihongo' ) {
367         # Setup Nihongo (Japanese) parser.
368
369         my $nihongo_parser = $self->config_( 'nihongo_parser' );
370
371         $nihongo_parser = $self->{parser__}->setup_nihongo_parser( $nihongo_parser );
372
373         $self->log_( 2, "Use Nihongo (Japanese) parser : $nihongo_parser" );
374         $self->config_( 'nihongo_parser', $nihongo_parser );
375
376         # Since Text::Kakasi is not thread-safe, we use it under the
377         # control of a Mutex to avoid a crash if we are running on
378         # Windows and using the fork.
379
380         if ( ( $nihongo_parser eq 'kakasi' ) && ( $^O eq 'MSWin32' ) &&
381              ( ( ( $self->module_config_( 'pop3', 'enabled' ) ) &&
382                  ( $self->module_config_( 'pop3', 'force_fork' ) ) ) ||
383                ( ( $self->module_config_( 'nntp', 'enabled' ) ) &&
384                  ( $self->module_config_( 'nntp', 'force_fork' ) ) ) ||
385                ( ( $self->module_config_( 'smtp', 'enabled' ) ) &&
386                  ( $self->module_config_( 'smtp', 'force_fork' ) ) ) ) ) {
387             $self->{parser__}->{need_kakasi_mutex__} = 1;
388
389             # Prepare the Mutex.
390             require POPFile::Mutex;
391             $self->{parser__}->{kakasi_mutex__} = new POPFile::Mutex( 'mailparse_kakasi' );
392             $self->log_( 2, "Create mutex for Kakasi." );
393         }
394     }
395
396     $self->upgrade_predatabase_data__();
397
398     return 1;
399 }
400
401 #----------------------------------------------------------------------------
402 #
403 # stop
404 #
405 # Called when POPFile is terminating
406 #
407 #----------------------------------------------------------------------------
408 sub stop
409 {
410     my ( $self ) = @_;
411
412     $self->db_disconnect__();
413     delete $self->{parser__};
414 }
415
416 #----------------------------------------------------------------------------
417 #
418 # classified
419 #
420 # Called to inform the module about a classification event
421 #
422 # There is no return value from this method
423 #
424 #----------------------------------------------------------------------------
425 sub classified
426 {
427     my ( $self, $session, $class ) = @_;
428
429     $self->set_bucket_parameter( $session, $class, 'count',             # PROFILE BLOCK START
430         $self->get_bucket_parameter( $session, $class, 'count' ) + 1 ); # PROFILE BLOCK STOP
431 }
432
433 #----------------------------------------------------------------------------
434 #
435 # backup_database__
436 #
437 # Called when the TICKD message is received each hour and if we are using
438 # the default SQLite database will make a copy with the .backup extension
439 #
440 #----------------------------------------------------------------------------
441 sub backup_database__
442 {
443     my ( $self ) = @_;
444
445     # If database backup is turned on and we are using SQLite then
446     # backup the database by copying it
447
448     if ( ( $self->config_( 'sqlite_tweaks' ) & 2 ) &&
449          $self->{db_is_sqlite__} ) {
450         if ( !copy( $self->{db_name__}, $self->{db_name__} . ".backup" ) ) {
451         $self->log_( 0, "Failed to backup database ".$self->{db_name__} );
452         }
453     }
454 }
455
456 #----------------------------------------------------------------------------
457 #
458 # tweak_sqlite
459 #
460 # Called when a module wants is to tweak access to the SQLite database.
461 #
462 # $tweak    The tweak to apply (a bit in the sqlite_tweaks mask)
463 # $state    1 to enable the tweak, 0 to disable
464 # $db       The db handle to tweak
465 #
466 #----------------------------------------------------------------------------
467 sub tweak_sqlite
468 {
469     my ( $self, $tweak, $state, $db ) = @_;
470
471     if ( $self->{db_is_sqlite__} &&
472          ( $self->config_( 'sqlite_tweaks' ) & $tweak ) ) {
473
474         $self->log_( 1, "Performing tweak $tweak to $state" );
475
476         if ( $tweak == 1 ) {
477             my $sync = $state?'off':'normal';
478             $db->do( "pragma synchronous=$sync;" );
479         }
480     }
481 }
482
483 #----------------------------------------------------------------------------
484 #
485 # reclassified
486 #
487 # Called to inform the module about a reclassification from one bucket
488 # to another
489 #
490 # session            Valid API session
491 # bucket             The old bucket name
492 # newbucket          The new bucket name
493 # undo               1 if this is an undo operation
494 #
495 # There is no return value from this method
496 #
497 #----------------------------------------------------------------------------
498 sub reclassified
499 {
500     my ( $self, $session, $bucket, $newbucket, $undo ) = @_;
501
502     $self->log_( 0, "Reclassification from $bucket to $newbucket" );
503
504     my $c = $undo?-1:1;
505
506     if ( $bucket ne $newbucket ) {
507         my $count = $self->get_bucket_parameter(
508                         $session, $newbucket, 'count' );
509         my $newcount = $count + $c;
510         $newcount = 0 if ( $newcount < 0 );
511         $self->set_bucket_parameter(
512             $session, $newbucket, 'count', $newcount );
513
514         $count = $self->get_bucket_parameter(
515                      $session, $bucket, 'count' );
516         $newcount = $count - $c;
517         $newcount = 0 if ( $newcount < 0 );
518         $self->set_bucket_parameter(
519             $session, $bucket, 'count', $newcount );
520
521         my $fncount = $self->get_bucket_parameter(
522                           $session, $newbucket, 'fncount' );
523         my $newfncount = $fncount + $c;
524         $newfncount = 0 if ( $newfncount < 0 );
525         $self->set_bucket_parameter(
526             $session, $newbucket, 'fncount', $newfncount );
527
528         my $fpcount = $self->get_bucket_parameter(
529                           $session, $bucket, 'fpcount' );
530         my $newfpcount = $fpcount + $c;
531         $newfpcount = 0 if ( $newfpcount < 0 );
532         $self->set_bucket_parameter(
533             $session, $bucket, 'fpcount', $newfpcount );
534     }
535 }
536
537 #----------------------------------------------------------------------------
538 #
539 # get_color
540 #
541 # Retrieves the color for a specific word, color is the most likely bucket
542 #
543 # $session  Session key returned by get_session_key
544 # $word     Word to get the color of
545 #
546 #----------------------------------------------------------------------------
547 sub get_color
548 {
549     my ( $self, $session, $word ) = @_;
550
551     my $max   = -10000;
552     my $color = 'black';
553
554     for my $bucket ($self->get_buckets( $session )) {
555         my $prob = $self->get_value_( $session, $bucket, $word );
556
557         if ( $prob != 0 )  {
558             if ( $prob > $max )  {
559                 $max   = $prob;
560                 $color = $self->get_bucket_parameter( $session, $bucket,
561                              'color' );
562             }
563         }
564     }
565
566     return $color;
567 }
568
569 #----------------------------------------------------------------------------
570 #
571 # get_not_likely_
572 #
573 # Returns the probability of a word that doesn't appear
574 #
575 #----------------------------------------------------------------------------
576 sub get_not_likely_
577 {
578     my ( $self, $session ) = @_;
579
580     my $userid = $self->valid_session_key__( $session );
581     return undef if ( !defined( $userid ) );
582
583     return $self->{not_likely__}{$userid};
584 }
585
586 #----------------------------------------------------------------------------
587 #
588 # get_value_
589 #
590 # Returns the value for a specific word in a bucket.  The word is
591 # converted to the log value of the probability before return to get
592 # the raw value just hit the hash directly or call get_base_value_
593 #
594 #----------------------------------------------------------------------------
595 sub get_value_
596 {
597     my ( $self, $session, $bucket, $word ) = @_;
598
599     my $value = $self->db_get_word_count__( $session, $bucket, $word );
600
601     if ( defined( $value ) && ( $value > 0 ) ) {
602
603         # Profiling notes:
604         #
605         # I tried caching the log of the total value and then doing
606         # log( $value ) - $cached and this turned out to be
607         # much slower than this single log with a division in it
608
609         return log( $value /
610                     $self->get_bucket_word_count( $session, $bucket ) );
611     } else {
612         return 0;
613     }
614 }
615
616 sub get_base_value_
617 {
618     my ( $self, $session, $bucket, $word ) = @_;
619
620     my $value = $self->db_get_word_count__( $session, $bucket, $word );
621
622     if ( defined( $value ) ) {
623         return $value;
624     } else {
625         return 0;
626     }
627 }
628
629 #----------------------------------------------------------------------------
630 #
631 # set_value_
632 #
633 # Sets the value for a word in a bucket and updates the total word
634 # counts for the bucket and globally
635 #
636 #----------------------------------------------------------------------------
637 sub set_value_
638 {
639     my ( $self, $session, $bucket, $word, $value ) = @_;
640
641     if ( $self->db_put_word_count__( $session, $bucket,
642              $word, $value ) == 1 ) {
643
644         # If we set the word count to zero then clean it up by deleting the
645         # entry
646
647         my $userid = $self->valid_session_key__( $session );
648         my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
649         $self->validate_sql_prepare_and_execute( $self->{db_delete_zero_words__}, $bucketid );
650
651         return 1;
652     } else {
653         return 0;
654     }
655 }
656
657 #----------------------------------------------------------------------------
658 #
659 # get_sort_value_ behaves the same as get_value_, except that it
660 # returns not_likely__ rather than 0 if the word is not found.  This
661 # makes its result more suitable as a sort key for bucket ranking.
662 #
663 #----------------------------------------------------------------------------
664 sub get_sort_value_
665 {
666     my ( $self, $session, $bucket, $word ) = @_;
667
668     my $v = $self->get_value_( $session, $bucket, $word );
669
670     if ( $v == 0 ) {
671
672         my $userid = $self->valid_session_key__( $session );
673         return undef if ( !defined( $userid ) );
674
675         return $self->{not_likely__}{$userid};
676     } else {
677         return $v;
678     }
679 }
680
681 #----------------------------------------------------------------------------
682 #
683 # update_constants__
684 #
685 # Updates not_likely and bucket_start
686 #
687 #----------------------------------------------------------------------------
688 sub update_constants__
689 {
690     my ( $self, $session ) = @_;
691
692     my $wc = $self->get_word_count( $session );
693
694     my $userid = $self->valid_session_key__( $session );
695     return undef if ( !defined( $userid ) );
696
697     if ( defined( $wc ) && $wc > 0 )  {
698         $self->{not_likely__}{$userid} = -log( 10 * $wc );
699
700         foreach my $bucket ($self->get_buckets( $session )) {
701             my $total = $self->get_bucket_word_count( $session, $bucket );
702
703             if ( $total != 0 ) {
704                 $self->{bucket_start__}{$userid}{$bucket} = log( $total /
705                                                                  $wc );
706             } else {
707                 $self->{bucket_start__}{$userid}{$bucket} = 0;
708             }
709         }
710     } else {
711         $self->{not_likely__}{$userid} = 0;
712     }
713 }
714
715 #----------------------------------------------------------------------------
716 #
717 # db_connect__
718 #
719 # Connects to the POPFile database and returns 1 if successful
720 #
721 #----------------------------------------------------------------------------
722 sub db_connect__
723 {
724     my ( $self ) = @_;
725
726     # Connect to the database, note that the database must exist for
727     # this to work, to make this easy for people POPFile we will
728     # create the database automatically here using the file
729     # 'popfile.sql' which should be located in the same directory the
730     # Classifier/Bayes.pm module
731
732     # If we are using SQLite then the dbname is actually the name of a
733     # file, and hence we treat it like one, otherwise we leave it
734     # alone
735
736     my $dbname;
737     my $dbconnect = $self->config_( 'dbconnect' );
738     my $dbpresent;
739     my $sqlite = ( $dbconnect =~ /sqlite/i );
740     my $mysql  = ( $dbconnect =~ /mysql/i );
741     my %connection_options = ();
742
743     if ( $sqlite ) {
744         $dbname = $self->get_user_path_( $self->config_( 'database' ) );
745         $dbpresent = ( -e $dbname ) || 0;
746     } else {
747         $dbname = $self->config_( 'database' );
748         $dbpresent = 1;
749
750         if ( $mysql ) {
751
752             # Turn on auto_reconnect
753
754             $connection_options{mysql_auto_reconnect} = 1;
755         }
756     }
757
758     # Record whether we are using SQLite or not and the name of the
759     # database so that other routines can access it; this is used by
760     # the backup_database__ routine to make a backup copy of the
761     # database when using SQLite.
762
763     $self->{db_is_sqlite__} = $sqlite;
764     $self->{db_name__}      = $dbname;
765
766     # Now perform the connect, note that this is database independent
767     # at this point, the actual database that we connect to is defined
768     # by the dbconnect parameter.
769
770     $dbconnect =~ s/\$dbname/$dbname/g;
771
772     $self->log_( 0, "Attempting to connect to $dbconnect ($dbpresent)" );
773
774     my $need_convert = 0;
775     my $old_dbh;
776
777     if ( $sqlite && $dbpresent ) {
778
779         # Check if the database is SQLite2 format
780
781         open DBFILE, $dbname;
782         my $buffer;
783         my $readed = sysread( DBFILE, $buffer, 47 );
784         close DBFILE;
785
786         if ( $buffer eq '** This file contains an SQLite 2.1 database **' ) {
787             $self->log_( 0, 'SQLite 2 database found. Try to upgrade' );
788
789             # Test DBD::SQLite version
790
791             my $ver = -1;
792             eval {
793                 require DBD::SQLite;
794                 $ver = $DBD::SQLite::VERSION;
795             };
796
797             if ( $ver >= 1.00 ) {
798                 $self->log_( 0, "DBD::SQLite $ver found" );
799
800                 # Backup SQLite2 database
801
802                 my $old_dbname = $dbname . '-sqlite2';
803                 unlink $old_dbname;
804                 rename $dbname, $old_dbname;
805
806                 # Connect to SQLite2 database
807
808                 my $old_dbconnect = $self->config_( 'dbconnect' );
809                 $old_dbconnect =~ s/SQLite:/SQLite2:/;
810                 $old_dbconnect =~ s/\$dbname/$old_dbname/g;
811
812                 $old_dbh = DBI->connect( $old_dbconnect,               # PROFILE BLOCK START
813                                          $self->config_( 'dbuser' ),
814                                          $self->config_( 'dbauth' ) ); # PROFILE BLOCK STOP
815
816                 # Update the config file
817
818                 $dbconnect = $self->config_( 'dbconnect' );
819                 $dbconnect =~ s/SQLite2:/SQLite:/;
820                 $self->config_( 'dbconnect', $dbconnect );
821                 $dbconnect =~ s/\$dbname/$dbname/g;
822
823                 $need_convert = 1;
824             }
825         } else {
826
827             # Update the config file
828
829             $dbconnect = $self->config_( 'dbconnect' );
830             $dbconnect =~ s/SQLite2:/SQLite:/;
831             $self->config_( 'dbconnect', $dbconnect );
832             $dbconnect =~ s/\$dbname/$dbname/g;
833         }
834     }
835
836     $self->{db__} = DBI->connect( $dbconnect,                 # PROFILE BLOCK START
837                                   $self->config_( 'dbuser' ),
838                                   $self->config_( 'dbauth' ),
839                                   \%connection_options );     # PROFILE BLOCK STOP
840
841     if ( !defined( $self->{db__} ) ) {
842         $self->log_( 0, "Failed to connect to database and got error $DBI::errstr" );
843         return 0;
844     }
845
846     if ( $sqlite ) {
847         $self->log_( 0, "Using SQLite library version " . $self->{db__}{sqlite_version} );
848
849         if ( $need_convert ) {
850             $self->log_( 0, 'Convert SQLite2 database to SQLite3 database' );
851
852             $self->db_upgrade__( $old_dbh );
853             $old_dbh->disconnect;
854
855             $self->log_( 0, 'Database convert completed' );
856         }
857
858         # For Japanese compatibility
859
860         if ( $self->{parser__}->{lang__} eq 'Nihongo' ) {
861             $self->{db__}->do( 'pragma case_sensitive_like=1;' );
862         }
863     }
864
865     if ( !$dbpresent ) {
866         if ( !$self->insert_schema__( $sqlite ) ) {
867             return 0;
868         }
869     }
870
871     # Now check for a need to upgrade the database because the schema
872     # has been changed.  From POPFile v0.22.0 there's a special
873     # 'popfile' table inside the database that contains the schema
874     # version number.  If the version number doesn't match or is
875     # missing then do the upgrade.
876
877     open SCHEMA, '<' . $self->get_root_path_( 'Classifier/popfile.sql' );
878     <SCHEMA> =~ /-- POPFILE SCHEMA (\d+)/;
879     my $version = $1;
880     close SCHEMA;
881
882     my $need_upgrade = 1;
883
884     #
885     # retrieve the SQL_IDENTIFIER_QUOTE_CHAR for the database then use it
886     # to strip off any sqlquotechars from the table names we retrieve
887     #
888
889     my $sqlquotechar = $self->{db__}->get_info(29) || '';
890     my @tables = map { s/$sqlquotechar//g; $_ } ($self->{db__}->tables());
891
892     foreach my $table (@tables) {
893         if ( $table =~ /\.?popfile$/ ) {
894             my @row = $self->{db__}->selectrow_array(
895                'select version from popfile;' );
896
897             if ( $#row == 0 ) {
898                 $need_upgrade = ( $row[0] != $version );
899             }
900         }
901     }
902
903     if ( $need_upgrade ) {
904
905         print "\n\nDatabase schema is outdated, performing automatic upgrade\n";
906
907         # The database needs upgrading
908
909         $self->db_upgrade__();
910
911         print "\nDatabase upgrade complete\n\n";
912     }
913
914     # Now prepare common SQL statements for use, as a matter of convention the
915     # parameters to each statement always appear in the following order:
916     #
917     # user
918     # bucket
919     # word
920     # parameter
921
922     $self->{db_get_buckets__} = $self->{db__}->prepare(                                 # PROFILE BLOCK START
923              'select name, id, pseudo from buckets
924                   where buckets.userid = ?;' );                                         # PROFILE BLOCK STOP
925
926     $self->{db_get_wordid__} = $self->{db__}->prepare(                                  # PROFILE BLOCK START
927              'select id from words
928                   where words.word = ? limit 1;' );                                     # PROFILE BLOCK STOP
929
930     $self->{db_get_userid__} = $self->{db__}->prepare(                                  # PROFILE BLOCK START
931              'select id from users where name = ?
932                                      and password = ? limit 1;' );                      # PROFILE BLOCK STOP
933
934     $self->{db_get_word_count__} = $self->{db__}->prepare(                              # PROFILE BLOCK START
935              'select matrix.times from matrix
936                   where matrix.bucketid = ? and
937                         matrix.wordid = ? limit 1;' );                                  # PROFILE BLOCK STOP
938
939     $self->{db_put_word_count__} = $self->{db__}->prepare(                              # PROFILE BLOCK START
940            'replace into matrix ( bucketid, wordid, times ) values ( ?, ?, ? );' );     # PROFILE BLOCK STOP
941
942     $self->{db_get_bucket_unique_counts__} = $self->{db__}->prepare(                    # PROFILE BLOCK START
943              'select count(matrix.wordid), buckets.name from matrix, buckets
944                   where buckets.userid = ?
945                     and matrix.bucketid = buckets.id
946                   group by buckets.name;' );                                            # PROFILE BLOCK STOP
947
948     $self->{db_get_bucket_word_counts__} = $self->{db__}->prepare(                      # PROFILE BLOCK START
949              'select sum(matrix.times), buckets.name from matrix, buckets
950                   where matrix.bucketid = buckets.id
951                     and buckets.userid = ?
952                     group by buckets.name;' );                                          # PROFILE BLOCK STOP
953
954     $self->{db_get_unique_word_count__} = $self->{db__}->prepare(                       # PROFILE BLOCK START
955              'select count(matrix.wordid) from matrix, buckets
956                   where matrix.bucketid = buckets.id and
957                         buckets.userid = ?;' );                                         # PROFILE BLOCK STOP
958
959     $self->{db_get_full_total__} = $self->{db__}->prepare(                              # PROFILE BLOCK START
960              'select sum(matrix.times) from matrix, buckets
961                   where buckets.userid = ? and
962                         matrix.bucketid = buckets.id;' );                               # PROFILE BLOCK STOP
963
964     $self->{db_get_bucket_parameter__} = $self->{db__}->prepare(                        # PROFILE BLOCK START
965              'select bucket_params.val from bucket_params
966                   where bucket_params.bucketid = ? and
967                         bucket_params.btid = ?;' );                                     # PROFILE BLOCK STOP
968
969     $self->{db_set_bucket_parameter__} = $self->{db__}->prepare(                        # PROFILE BLOCK START
970            'replace into bucket_params ( bucketid, btid, val ) values ( ?, ?, ? );' );  # PROFILE BLOCK STOP
971
972     $self->{db_get_bucket_parameter_default__} = $self->{db__}->prepare(                # PROFILE BLOCK START
973              'select bucket_template.def from bucket_template
974                   where bucket_template.id = ?;' );                                     # PROFILE BLOCK STOP
975
976     $self->{db_get_buckets_with_magnets__} = $self->{db__}->prepare(                    # PROFILE BLOCK START
977              'select buckets.name from buckets, magnets
978                   where buckets.userid = ? and
979                         magnets.id != 0 and
980                         magnets.bucketid = buckets.id group by buckets.name order by buckets.name;' );
981                                                                                         # PROFILE BLOCK STOP
982     $self->{db_delete_zero_words__} = $self->{db__}->prepare(                           # PROFILE BLOCK START
983              'delete from matrix
984                   where matrix.times = 0
985                     and matrix.bucketid = ?;' );                                        # PROFILE BLOCK STOP
986
987     # Get the mapping from parameter names to ids into a local hash
988
989     my $h = $self->validate_sql_prepare_and_execute( "select name, id from bucket_template;" );
990     while ( my $row = $h->fetchrow_arrayref ) {
991         $self->{db_parameterid__}{$row->[0]} = $row->[1];
992     }
993     $h->finish;
994
995     return 1;
996 }
997
998 #----------------------------------------------------------------------------
999 #
1000 # insert_schema__
1001 #
1002 # Insert the POPFile schema in a database
1003 #
1004 # $sqlite          Set to 1 if this is a SQLite database
1005 #
1006 #----------------------------------------------------------------------------
1007 sub insert_schema__
1008 {
1009     my ( $self, $sqlite ) = @_;
1010
1011     if ( -e $self->get_root_path_( 'Classifier/popfile.sql' ) ) {
1012         my $schema = '';
1013
1014         $self->log_( 0, "Creating database schema" );
1015
1016         open SCHEMA, '<' . $self->get_root_path_( 'Classifier/popfile.sql' );
1017         while ( <SCHEMA> ) {
1018             next if ( /^--/ );
1019             next if ( !/[a-z;]/ );
1020             s/--.*$//;
1021
1022             # If the line begins 'alter' and we are doing SQLite then ignore
1023             # the line
1024
1025             if ( $sqlite && ( /^alter/i ) ) {
1026                 next;
1027             }
1028
1029             $schema .= $_;
1030
1031             if ( ( /end;/ ) || ( /\);/ ) || ( /^alter/i ) ) {
1032                 $self->{db__}->do( $schema );
1033                 $schema = '';
1034             }
1035         }
1036         close SCHEMA;
1037         return 1;
1038     } else {
1039         $self->log_( 0, "Can't find the database schema" );
1040         return 0;
1041     }
1042 }
1043
1044 #----------------------------------------------------------------------------
1045 #
1046 # db_upgrade__
1047 #
1048 # Upgrade the POPFile schema / Convert the database
1049 #
1050 # $db_from         Database handle convert from
1051 #                  undef if upgrade POPFile schema
1052 #
1053 #----------------------------------------------------------------------------
1054 sub db_upgrade__
1055 {
1056     my ( $self, $db_from ) = @_;
1057
1058     my $drop_table;
1059
1060     if ( !defined( $db_from ) ) {
1061         # Upgrade
1062
1063         $drop_table = 1;
1064         $db_from = $self->{db__};
1065     }
1066
1067     my $from_sqlite = ( $db_from->{Driver}->{Name} =~ /SQLite/ );
1068     my $to_sqlite = ( $self->{db__}->{Driver}->{Name} =~ /SQLite/ );
1069
1070     my $sqlquotechar = $db_from->get_info(29) || '';
1071     my @tables = map { s/$sqlquotechar//g; $_ } ($db_from->tables());
1072
1073     # We are going to dump out all the data in the database as
1074     # INSERT OR IGNORE statements in a temporary file, then DROP all
1075     # the tables in the database, then recreate the schema from the
1076     # new schema and finally rerun the inserts.
1077
1078     my $i = 0;
1079     my $ins_file = $self->get_user_path_( 'insert.sql' );
1080     open INSERT, '>' . $ins_file;
1081
1082     foreach my $table (@tables) {
1083         next if ( $table =~ /\.?popfile$/ );
1084         if ( $from_sqlite && ( $table =~ /^sqlite_/ ) ) {
1085             next;
1086         }
1087         if ( $i > 99 ) {
1088             print "\n";
1089         }
1090         print "    Saving table $table\n    ";
1091
1092         my $t = $db_from->prepare( "select * from $table;" );
1093         $t->execute;
1094         $i = 0;
1095         while ( 1 ) {
1096             if ( ( ++$i % 100 ) == 0 ) {
1097                 print "[$i]";
1098                 flush STDOUT;
1099             }
1100             if ( ( $i % 1000 ) == 0 ) {
1101                 print "\n";
1102                 flush STDOUT;
1103             }
1104             my $rows = $t->fetchrow_arrayref;
1105
1106             last if ( !defined( $rows ) );
1107
1108             if ( $to_sqlite ) {
1109                 print INSERT "INSERT OR IGNORE INTO $table (";
1110             } else {
1111                 print INSERT "INSERT INTO $table (";
1112             }
1113             for my $i (0..$t->{NUM_OF_FIELDS}-1) {
1114                 if ( $i != 0 ) {
1115                     print INSERT ',';
1116                 }
1117                 print INSERT $t->{NAME}->[$i];
1118             }
1119             print INSERT ') VALUES (';
1120             for my $i (0..$t->{NUM_OF_FIELDS}-1) {
1121                 if ( $i != 0 ) {
1122                     print INSERT ',';
1123                 }
1124                 my $val = $rows->[$i];
1125                 if ( $t->{TYPE}->[$i] !~ /^int/i ) {
1126                     $val = '' if ( !defined( $val ) );
1127                     $val = $self->db_quote( $val );
1128                 } else {
1129                     $val = 'NULL' if ( !defined( $val ) );
1130                 }
1131                 print INSERT $val;
1132             }
1133             print INSERT ");\n";
1134         }
1135         $t->finish;
1136     }
1137
1138     close INSERT;
1139
1140     if ( $i > 99 ) {
1141         print "\n";
1142     }
1143
1144     if ( $drop_table ) {
1145         foreach my $table (@tables) {
1146             if ( $from_sqlite && ( $table =~ /^sqlite_/ ) ) {
1147                 next;
1148             }
1149             print "    Dropping old table $table\n";
1150             $self->{db__}->do( "DROP TABLE $table;" );
1151         }
1152     }
1153
1154     print "    Inserting new database schema\n";
1155     if ( !$self->insert_schema__( $to_sqlite ) ) {
1156         return 0;
1157     }
1158
1159     print "    Restoring old data\n    ";
1160
1161     $self->{db__}->begin_work;
1162     open INSERT, '<' . $ins_file;
1163     $i = 0;
1164     while ( <INSERT> ) {
1165         if ( ( ++$i % 100 ) == 0 ) {
1166            print "[$i]";
1167            flush STDOUT;
1168         }
1169         if ( ( $i % 1000 ) == 0 ) {
1170             print "\n";
1171             flush STDOUT;
1172         }
1173         s/[\r\n]//g;
1174         $self->{db__}->do( $_ );
1175     }
1176     close INSERT;
1177     $self->{db__}->commit;
1178
1179     unlink $ins_file;
1180 }
1181
1182 #----------------------------------------------------------------------------
1183 #
1184 # db_disconnect__
1185 #
1186 # Disconnect from the POPFile database
1187 #
1188 #----------------------------------------------------------------------------
1189 sub db_disconnect__
1190 {
1191     my ( $self ) = @_;
1192
1193     $self->{db_get_buckets__}->finish;
1194     $self->{db_get_wordid__}->finish;
1195     $self->{db_get_userid__}->finish;
1196     $self->{db_get_word_count__}->finish;
1197     $self->{db_put_word_count__}->finish;
1198     $self->{db_get_bucket_unique_counts__}->finish;
1199     $self->{db_get_bucket_word_counts__}->finish;
1200     $self->{db_get_unique_word_count__}->finish;
1201     $self->{db_get_full_total__}->finish;
1202     $self->{db_get_bucket_parameter__}->finish;
1203     $self->{db_set_bucket_parameter__}->finish;
1204     $self->{db_get_bucket_parameter_default__}->finish;
1205     $self->{db_get_buckets_with_magnets__}->finish;
1206     $self->{db_delete_zero_words__}->finish;
1207
1208     # Avoid DBD::SQLite 'closing dbh with active statement handles' bug
1209
1210     undef $self->{db_get_buckets__};
1211     undef $self->{db_get_wordid__};
1212     undef $self->{db_get_userid__};
1213     undef $self->{db_get_word_count__};
1214     undef $self->{db_put_word_count__};
1215     undef $self->{db_get_bucket_unique_counts__};
1216     undef $self->{db_get_bucket_word_counts__};
1217     undef $self->{db_get_unique_word_count__};
1218     undef $self->{db_get_full_total__};
1219     undef $self->{db_get_bucket_parameter__};
1220     undef $self->{db_set_bucket_parameter__};
1221     undef $self->{db_get_bucket_parameter_default__};
1222     undef $self->{db_get_buckets_with_magnets__};
1223     undef $self->{db_delete_zero_words__};
1224
1225     if ( defined( $self->{db__} ) ) {
1226         $self->{db__}->disconnect;
1227         undef $self->{db__};
1228     }
1229 }
1230
1231 #----------------------------------------------------------------------------
1232 #
1233 # db_update_cache__
1234 #
1235 # Updates our local cache of user and bucket ids.
1236 #
1237 # $session           Must be a valid session
1238 #
1239 #----------------------------------------------------------------------------
1240 sub db_update_cache__
1241 {
1242     my ( $self, $session ) = @_;
1243
1244     my $userid = $self->valid_session_key__( $session );
1245     return undef if ( !defined( $userid ) );
1246
1247     delete $self->{db_bucketid__}{$userid};
1248
1249     $self->validate_sql_prepare_and_execute( $self->{db_get_buckets__}, $userid );
1250     while ( my $row = $self->{db_get_buckets__}->fetchrow_arrayref ) {
1251         $self->{db_bucketid__}{$userid}{$row->[0]}{id} = $row->[1];
1252         $self->{db_bucketid__}{$userid}{$row->[0]}{pseudo} = $row->[2];
1253         $self->{db_bucketcount__}{$userid}{$row->[0]} = 0;
1254     }
1255
1256     $self->validate_sql_prepare_and_execute( $self->{db_get_bucket_word_counts__}, $userid );
1257
1258     for my $b (sort keys %{$self->{db_bucketid__}{$userid}}) {
1259         $self->{db_bucketcount__}{$userid}{$b} = 0;
1260         $self->{db_bucketunique__}{$userid}{$b} = 0;
1261     }
1262
1263     while ( my $row = $self->{db_get_bucket_word_counts__}->fetchrow_arrayref ) {
1264         $self->{db_bucketcount__}{$userid}{$row->[1]} = $row->[0];
1265     }
1266
1267     $self->validate_sql_prepare_and_execute( $self->{db_get_bucket_unique_counts__}, $userid );
1268
1269     while ( my $row = $self->{db_get_bucket_unique_counts__}->fetchrow_arrayref ) {
1270         $self->{db_bucketunique__}{$userid}{$row->[1]} = $row->[0];
1271     }
1272
1273     $self->update_constants__( $session );
1274 }
1275
1276 #----------------------------------------------------------------------------
1277 #
1278 # db_get_word_count__
1279 #
1280 # Return the 'count' value for a word in a bucket.  If the word is not
1281 # found in that bucket then returns undef.
1282 #
1283 # $session          Valid session ID from get_session_key
1284 # $bucket           bucket word is in
1285 # $word             word to lookup
1286 #
1287 #----------------------------------------------------------------------------
1288 sub db_get_word_count__
1289 {
1290     my ( $self, $session, $bucket, $word ) = @_;
1291
1292     my $userid = $self->valid_session_key__( $session );
1293     return undef if ( !defined( $userid ) );
1294
1295     $self->validate_sql_prepare_and_execute( $self->{db_get_wordid__}, $word );
1296     my $result = $self->{db_get_wordid__}->fetchrow_arrayref;
1297     if ( !defined( $result ) ) {
1298         return undef;
1299     }
1300
1301     my $wordid = $result->[0];
1302
1303     $self->validate_sql_prepare_and_execute( $self->{db_get_word_count__}, $self->{db_bucketid__}{$userid}{$bucket}{id}, $wordid );
1304     $result = $self->{db_get_word_count__}->fetchrow_arrayref;
1305     if ( defined( $result ) ) {
1306          return $result->[0];
1307     } else {
1308          return undef;
1309     }
1310 }
1311
1312 #----------------------------------------------------------------------------
1313 #
1314 # db_put_word_count__
1315 #
1316 # Update 'count' value for a word in a bucket, if the update fails
1317 # then returns 0 otherwise is returns 1
1318 #
1319 # $session          Valid session ID from get_session_key
1320 # $bucket           bucket word is in
1321 # $word             word to update
1322 # $count            new count value
1323 #
1324 #----------------------------------------------------------------------------
1325 sub db_put_word_count__
1326 {
1327     my ( $self, $session, $bucket, $word, $count ) = @_;
1328
1329     my $userid = $self->valid_session_key__( $session );
1330     return undef if ( !defined( $userid ) );
1331
1332     # We need to have two things before we can start, the id of the
1333     # word in the words table (if there's none then we need to add the
1334     # word), the bucket id in the buckets table (which must exist)
1335
1336     $word = $self->db_quote($word);
1337
1338     my $result = $self->{db__}->selectrow_arrayref(
1339                      "select words.id from words where words.word = $word limit 1;");
1340
1341     if ( !defined( $result ) ) {
1342         $self->{db__}->do( "insert into words ( word ) values ( $word );" );
1343         $result = $self->{db__}->selectrow_arrayref(
1344                      "select words.id from words where words.word = $word limit 1;");
1345     }
1346
1347     my $wordid = $result->[0];
1348     my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
1349
1350     $self->validate_sql_prepare_and_execute( $self->{db_put_word_count__}, $bucketid, $wordid, $count );
1351
1352     return 1;
1353 }
1354
1355 #----------------------------------------------------------------------------
1356 #
1357 # upgrade_predatabase_data__
1358 #
1359 # Looks for old POPFile data (in flat files or BerkeleyDB tables) and
1360 # upgrades it to the SQL database.  Data upgraded is removed.
1361 #
1362 #----------------------------------------------------------------------------
1363 sub upgrade_predatabase_data__
1364 {
1365     my ( $self ) = @_;
1366     my $c      = 0;
1367
1368     # There's an assumption here that this is the single user version
1369     # of POPFile and hence what we do is cheat and get a session key
1370     # assuming that the user name is admin with password ''
1371
1372     my $session = $self->get_session_key( 'admin', '' );
1373
1374     if ( !defined( $session ) ) {
1375         $self->log_( 0, "Tried to get the session key for user admin and failed; cannot upgrade old data" );
1376         return;
1377     }
1378
1379     my @buckets = glob $self->get_user_path_( $self->config_( 'corpus' ) . '/*' );
1380
1381     foreach my $bucket (@buckets) {
1382
1383         # A bucket directory must be a directory
1384
1385         next unless ( -d $bucket );
1386         next unless ( ( -e "$bucket/table" ) || ( -e "$bucket/table.db" ) );
1387
1388         return 0 if ( !$self->upgrade_bucket__( $session, $bucket ) );
1389
1390         my $color = '';
1391
1392         # See if there's a color file specified
1393         if ( open COLOR, '<' . "$bucket/color" ) {
1394             $color = <COLOR>;
1395
1396             # Someone (who shall remain nameless) went in and manually created
1397             # empty color files in their corpus directories which would cause
1398             # $color at this point to be undefined and hence you'd get warnings
1399             # about undefined variables below.  So this little test is to deal
1400             # with that user and to make POPFile a little safer which is always
1401             # a good thing
1402
1403             if ( !defined( $color ) ) {
1404                 $color = '';
1405             } else {
1406                 $color =~ s/[\r\n]//g;
1407             }
1408             close COLOR;
1409             unlink "$bucket/color";
1410         }
1411
1412         $bucket =~ /([[:alpha:]0-9-_]+)$/;
1413         $bucket =  $1;
1414
1415         $self->set_bucket_color( $session, $bucket, ($color eq '')?$self->{possible_colors__}[$c]:$color );
1416
1417         $c = ($c+1) % ($#{$self->{possible_colors__}}+1);
1418     }
1419
1420     $self->release_session_key( $session );
1421
1422     return 1;
1423 }
1424
1425 #----------------------------------------------------------------------------
1426 #
1427 # upgrade_bucket__
1428 #
1429 # Loads an individual bucket
1430 #
1431 # $session           Valid session key from get_session_key
1432 # $bucket            The bucket name
1433 #
1434 #----------------------------------------------------------------------------
1435 sub upgrade_bucket__
1436 {
1437     my ( $self, $session, $bucket ) = @_;
1438
1439     $bucket =~ /([[:alpha:]0-9-_]+)$/;
1440     $bucket =  $1;
1441
1442     $self->create_bucket( $session, $bucket );
1443
1444     if ( open PARAMS, '<' . $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/params" ) ) {
1445         while ( <PARAMS> )  {
1446             s/[\r\n]//g;
1447             if ( /^([[:lower:]]+) ([^\r\n\t ]+)$/ )  {
1448                 $self->set_bucket_parameter( $session, $bucket, $1, $2 );
1449             }
1450         }
1451         close PARAMS;
1452         unlink $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/params" );
1453     }
1454
1455     # Pre v0.21.0 POPFile had GLOBAL parameters for subject modification,
1456     # XTC and XPL insertion.  To make the upgrade as clean as possible
1457     # check these parameters so that if they were OFF we set the equivalent
1458     # per bucket to off
1459
1460     foreach my $gl ( 'subject', 'xtc', 'xpl' ) {
1461         $self->log_( 1, "Checking deprecated parameter GLOBAL_$gl for $bucket\n" );
1462         my $val = $self->{configuration__}->deprecated_parameter( "GLOBAL_$gl" );
1463         if ( defined( $val ) && ( $val == 0 ) ) {
1464             $self->log_( 1, "GLOBAL_$gl is 0 for $bucket, overriding $gl\n" );
1465             $self->set_bucket_parameter( $session, $bucket, $gl, 0 );
1466         }
1467     }
1468
1469     # See if there are magnets defined
1470     if ( open MAGNETS, '<' . $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/magnets" ) ) {
1471         while ( <MAGNETS> )  {
1472             s/[\r\n]//g;
1473
1474             # Because of a bug in v0.17.9 and earlier of POPFile the text of
1475             # some magnets was getting mangled by certain characters having
1476             # a \ prepended.  Code here removes the \ in these cases to make
1477             # an upgrade smooth.
1478
1479             if ( /^([^ ]+) (.+)$/ )  {
1480                 my $type  = $1;
1481                 my $value = $2;
1482
1483                 # Some people were accidently creating magnets with
1484                 # trailing whitespace which really confused them later
1485                 # when their magnet did not match (see comment in
1486                 # UI::HTML::magnet for more detail)
1487
1488                 $value =~ s/^[ \t]+//g;
1489                 $value =~ s/[ \t]+$//g;
1490
1491                 $value =~ s/\\(\?|\*|\||\(|\)|\[|\]|\{|\}|\^|\$|\.)/$1/g;
1492                 $self->create_magnet( $session, $bucket, $type, $value );
1493             } else {
1494
1495                 # This branch is used to catch the original magnets in an
1496                 # old version of POPFile that were just there for from
1497                 # addresses only
1498
1499                 if ( /^(.+)$/ ) {
1500                     my $value = $1;
1501                     $value =~ s/\\(\?|\*|\||\(|\)|\[|\]|\{|\}|\^|\$|\.)/$1/g;
1502                     $self->create_magnet( $session, $bucket, 'from', $value );
1503                 }
1504             }
1505         }
1506         close MAGNETS;
1507         unlink $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/magnets" );
1508     }
1509
1510     # If there is no existing table but there is a table file (the old style
1511     # flat file used by POPFile for corpus storage) then create the new
1512     # database from it thus performing an automatic upgrade.
1513
1514     if ( -e $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table" ) ) {
1515         $self->log_( 0, "Performing automatic upgrade of $bucket corpus from flat file to DBI" );
1516
1517         $self->{db__}->begin_work;
1518
1519         if ( open WORDS, '<' . $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table" ) )  {
1520
1521             my $wc = 1;
1522
1523             my $first = <WORDS>;
1524             if ( defined( $first ) && ( $first =~ s/^__CORPUS__ __VERSION__ (\d+)// ) ) {
1525                 if ( $1 != $self->{corpus_version__} )  {
1526                     print STDERR "Incompatible corpus version in $bucket\n";
1527                     close WORDS;
1528                     $self->{db__}->rollback;
1529                     return 0;
1530                 } else {
1531                     $self->log_( 0, "Upgrading bucket $bucket..." );
1532
1533                     while ( <WORDS> ) {
1534                         if ( $wc % 100 == 0 ) {
1535                             $self->log_( 0, "$wc" );
1536                         }
1537                         $wc += 1;
1538                         s/[\r\n]//g;
1539
1540                         if ( /^([^\s]+) (\d+)$/ ) {
1541                             if ( $2 != 0 ) {
1542                                 $self->db_put_word_count__( $session, $bucket, $1, $2 );
1543                             }
1544                         } else {
1545                             $self->log_( 0, "Found entry in corpus for $bucket that looks wrong: \"$_\" (ignoring)" );
1546                         }
1547                     }
1548                 }
1549
1550                 if ( $wc > 1 ) {
1551                     $wc -= 1;
1552                     $self->log_( 0, "(completed $wc words)" );
1553                 }
1554                 close WORDS;
1555             } else {
1556                 close WORDS;
1557                 $self->{db__}->rollback;
1558                 unlink $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table" );
1559                 return 0;
1560             }
1561
1562             $self->{db__}->commit;
1563             unlink $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table" );
1564         }
1565     }
1566
1567     # Now check to see if there's a BerkeleyDB-style table
1568
1569     my $bdb_file = $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table.db" );
1570
1571     if ( -e $bdb_file ) {
1572         $self->log_( 0, "Performing automatic upgrade of $bucket corpus from BerkeleyDB to DBI" );
1573
1574         require BerkeleyDB;
1575
1576         my %h;
1577         tie %h, "BerkeleyDB::Hash", -Filename => $bdb_file;
1578
1579         $self->log_( 0, "Upgrading bucket $bucket..." );
1580         $self->{db__}->begin_work;
1581
1582         my $wc = 1;
1583
1584         for my $word (keys %h) {
1585             if ( $wc % 100 == 0 ) {
1586                 $self->log_( 0, "$wc" );
1587             }
1588
1589             next if ( $word =~ /__POPFILE__(LOG__TOTAL|TOTAL|UNIQUE)__/ );
1590
1591             $wc += 1;
1592             if ( $h{$word} != 0 ) {
1593                 $self->db_put_word_count__( $session, $bucket, $word, $h{$word} );
1594             }
1595         }
1596
1597         $wc -= 1;
1598         $self->log_( 0, "(completed $wc words)" );
1599         $self->{db__}->commit;
1600         untie %h;
1601         unlink $bdb_file;
1602     }
1603
1604     return 1;
1605 }
1606
1607 #----------------------------------------------------------------------------
1608 #
1609 # magnet_match_helper__
1610 #
1611 # Helper the determines if a specific string matches a certain magnet
1612 # type in a bucket, used by magnet_match_
1613 #
1614 # $session         Valid session from get_session_key
1615 # $match           The string to match
1616 # $bucket          The bucket to check
1617 # $type            The magnet type to check
1618 #
1619 #----------------------------------------------------------------------------
1620 sub magnet_match_helper__
1621 {
1622     my ( $self, $session, $match, $bucket, $type ) = @_;
1623
1624     my $userid = $self->valid_session_key__( $session );
1625     return undef if ( !defined( $userid ) );
1626
1627     $match = lc($match);
1628
1629     my @magnets;
1630
1631     my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
1632     my $h = $self->validate_sql_prepare_and_execute(                                           # PROFILE BLOCK START
1633         "select magnets.val, magnets.id from magnets, users, buckets, magnet_types
1634              where buckets.id = $bucketid and
1635                    magnets.id != 0 and
1636                    users.id = buckets.userid and
1637                    magnets.bucketid = buckets.id and
1638                    magnet_types.mtype = '$type' and
1639                    magnets.mtid = magnet_types.id order by magnets.val;" );   # PROFILE BLOCK STOP
1640     while ( my $row = $h->fetchrow_arrayref ) {
1641         push @magnets, [$row->[0], $row->[1]];
1642     }
1643     $h->finish;
1644
1645     foreach my $m (@magnets) {
1646         my ( $magnet, $id ) = @{$m};
1647         $magnet = lc($magnet);
1648
1649         for my $i (0..(length($match)-length($magnet))) {
1650             if ( substr( $match, $i, length($magnet)) eq $magnet ) {
1651                 $self->{magnet_used__}   = 1;
1652                 $self->{magnet_detail__} = $id;
1653
1654                 return 1;
1655             }
1656         }
1657     }
1658
1659     return 0;
1660 }
1661
1662 #----------------------------------------------------------------------------
1663 #
1664 # magnet_match__
1665 #
1666 # Helper the determines if a specific string matches a certain magnet
1667 # type in a bucket
1668 #
1669 # $session         Valid session from get_session_key
1670 # $match           The string to match
1671 # $bucket          The bucket to check
1672 # $type            The magnet type to check
1673 #
1674 #----------------------------------------------------------------------------
1675 sub magnet_match__
1676 {
1677     my ( $self, $session, $match, $bucket, $type ) = @_;
1678
1679     return $self->magnet_match_helper__( $session, $match, $bucket, $type );
1680 }
1681
1682 #----------------------------------------------------------------------------
1683 #
1684 # write_line__
1685 #
1686 # Writes a line to a file and parses it unless the classification is
1687 # already known
1688 #
1689 # $file         File handle for file to write line to
1690 # $line         The line to write
1691 # $class        (optional) The current classification
1692 #
1693 #----------------------------------------------------------------------------
1694 sub write_line__
1695 {
1696     my ( $self, $file, $line, $class ) = @_;
1697
1698     print $file $line if defined( $file );
1699
1700     if ( $class eq '' ) {
1701         $self->{parser__}->parse_line( $line );
1702     }
1703 }
1704
1705 #----------------------------------------------------------------------------
1706 #
1707 # add_words_to_bucket__
1708 #
1709 # Takes words previously parsed by the mail parser and adds/subtracts
1710 # them to/from a bucket, this is a helper used by
1711 # add_messages_to_bucket, remove_message_from_bucket
1712 #
1713 # $session        Valid session from get_session_key
1714 # $bucket         Bucket to add to
1715 # $subtract       Set to -1 means subtract the words, set to 1 means add
1716 #
1717 #----------------------------------------------------------------------------
1718 sub add_words_to_bucket__
1719 {
1720     my ( $self, $session, $bucket, $subtract ) = @_;
1721
1722     my $userid = $self->valid_session_key__( $session );
1723     return undef if ( !defined( $userid ) );
1724
1725     # Map the list of words to a list of counts currently in the database
1726     # then update those counts and write them back to the database.
1727
1728     my $words;
1729     $words = join( ',', map( $self->{db__}->quote( $_ ), (sort keys %{$self->{parser__}{words__}}) ) );
1730     $self->{get_wordids__} = $self->validate_sql_prepare_and_execute(        # PROFILE BLOCK START
1731              "select id, word
1732                   from words
1733                   where word in ( $words );" );             # PROFILE BLOCK STOP
1734
1735     my @id_list;
1736     my %wordmap;
1737
1738     while ( my $row = $self->{get_wordids__}->fetchrow_arrayref ) {
1739         push @id_list, ($row->[0]);
1740         $wordmap{$row->[1]} = $row->[0];
1741     }
1742
1743     $self->{get_wordids__}->finish;
1744     undef $self->{get_wordids__};
1745
1746     my $ids = join( ',', @id_list );
1747
1748     $self->{db_getwords__} = $self->validate_sql_prepare_and_execute(                                         # PROFILE BLOCK START
1749              "select matrix.times, matrix.wordid
1750                   from matrix
1751                   where matrix.wordid in ( $ids )
1752                     and matrix.bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};" );  # PROFILE BLOCK STOP
1753
1754     my %counts;
1755
1756     while ( my $row = $self->{db_getwords__}->fetchrow_arrayref ) {
1757         $counts{$row->[1]} = $row->[0];
1758     }
1759
1760     $self->{db_getwords__}->finish;
1761     undef $self->{db_getwords__};
1762
1763     $self->{db__}->begin_work;
1764     foreach my $word (keys %{$self->{parser__}->{words__}}) {
1765
1766         # If there's already a count then it means that the word is
1767         # already in the database and we have its id in
1768         # $wordmap{$word} so for speed we execute the
1769         # db_put_word_count__ query here rather than going through
1770         # set_value_ which would need to look up the wordid again
1771
1772         if ( defined( $wordmap{$word} ) && defined( $counts{$wordmap{$word}} ) ) {
1773             $self->validate_sql_prepare_and_execute( $self->{db_put_word_count__}, $self->{db_bucketid__}{$userid}{$bucket}{id},               # PROFILE BLOCK START
1774                 $wordmap{$word}, $counts{$wordmap{$word}} + $subtract * $self->{parser__}->{words__}{$word} ); # PROFILE BLOCK STOP
1775         } else {
1776
1777             # If the word is not in the database and we are trying to
1778             # subtract then we do nothing because negative values are
1779             # meaningless
1780
1781             if ( $subtract == 1 ) {
1782                 $self->db_put_word_count__( $session, $bucket, $word, $self->{parser__}->{words__}{$word} );
1783             }
1784         }
1785     }
1786
1787     # If we were doing a subtract operation it's possible that some of
1788     # the words in the bucket now have a zero count and should be
1789     # removed
1790
1791     if ( $subtract == -1 ) {
1792         $self->validate_sql_prepare_and_execute( $self->{db_delete_zero_words__}, $self->{db_bucketid__}{$userid}{$bucket}{id} );
1793     }
1794
1795     $self->{db__}->commit;
1796 }
1797
1798 #----------------------------------------------------------------------------
1799 #
1800 # echo_to_dot_
1801 #
1802 # $mail The stream (created with IO::) to send the message to (the
1803 # remote mail server)
1804 # $client (optional) The local mail client (created with IO::) that
1805 # needs the response
1806 # $file (optional) A file to print the response to, caller specifies
1807 # open style
1808 # $before (optional) String to send to client before the dot is sent
1809 #
1810 # echo all information from the $mail server until a single line with
1811 # a . is seen
1812 #
1813 # NOTE Also echoes the line with . to $client but not to $file
1814 #
1815 # Returns 1 if there was a . or 0 if reached EOF before we hit the .
1816 #
1817 #----------------------------------------------------------------------------
1818 sub echo_to_dot_
1819 {
1820     my ( $self, $mail, $client, $file, $before ) = @_;
1821
1822     my $hit_dot = 0;
1823
1824     my $isopen = open FILE, "$file" if ( defined( $file ) );
1825     binmode FILE if ($isopen);
1826
1827     while ( my $line = $self->slurp_( $mail ) ) {
1828
1829         # Check for an abort
1830
1831         last if ( $self->{alive_} == 0 );
1832
1833         # The termination has to be a single line with exactly a dot
1834         # on it and nothing else other than line termination
1835         # characters.  This is vital so that we do not mistake a line
1836         # beginning with . as the end of the block
1837
1838         if ( $line =~ /^\.(\r\n|\r|\n)$/ ) {
1839             $hit_dot = 1;
1840
1841             if ( defined( $before ) && ( $before ne '' ) ) {
1842                 print $client $before if ( defined( $client ) );
1843                 print FILE    $before if ( defined( $isopen ) );
1844             }
1845
1846             # Note that there is no print FILE here.  This is correct
1847             # because we do no want the network terminator . to appear
1848             # in the file version of any message
1849
1850             print $client $line if ( defined( $client ) );
1851             last;
1852         }
1853
1854         print $client $line if ( defined( $client ) );
1855         print FILE    $line if ( defined( $isopen ) );
1856
1857     }
1858
1859     close FILE if ( $isopen );
1860
1861     return $hit_dot;
1862 }
1863
1864 #----------------------------------------------------------------------------
1865 #
1866 # substr_euc__
1867 #
1868 # "substr" function which supports EUC Japanese charset
1869 #
1870 # $pos      Start position
1871 # $len      Word length
1872 #
1873 #----------------------------------------------------------------------------
1874 sub substr_euc__
1875 {
1876     my ( $str, $pos, $len ) = @_;
1877     my $result_str;
1878     my $char;
1879     my $count = 0;
1880     if ( !$pos ) {
1881         $pos = 0;
1882     }
1883     if ( !$len ) {
1884         $len = length( $str );
1885     }
1886
1887     for ( $pos = 0; $count < $len; $pos++ ) {
1888         $char = substr( $str, $pos, 1 );
1889         if ( $char =~ /[\x80-\xff]/ ) {
1890             $char = substr( $str, $pos++, 2 );
1891         }
1892         $result_str .= $char;
1893         $count++;
1894     }
1895
1896     return $result_str;
1897 }
1898
1899 #----------------------------------------------------------------------------
1900 #
1901 # generate_unique_session_key__
1902 #
1903 # Returns a unique string based session key that can be used as a key
1904 # in the api_sessions__
1905 #
1906 #----------------------------------------------------------------------------
1907 sub generate_unique_session_key__
1908 {
1909     my ( $self ) = @_;
1910
1911     my @chars = ( 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',   # PROFILE BLOCK START
1912                   'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'U', 'V', 'W', 'X', 'Y',
1913                   'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A' ); # PROFILE BLOCK STOP
1914
1915     my $session;
1916
1917     do {
1918         $session = '';
1919         my $length = int( 16 + rand(4) );
1920
1921         for my $i (0 .. $length) {
1922             my $random = $chars[int( rand(36) )];
1923
1924             # Just to add spice to things we sometimes lowercase the value
1925
1926             if ( rand(1) < rand(1) ) {
1927                 $random = lc($random);
1928             }
1929
1930             $session .= $random;
1931         }
1932     } while ( defined( $self->{api_sessions__}{$session} ) );
1933
1934     return $session;
1935 }
1936
1937 #----------------------------------------------------------------------------
1938 #
1939 # release_session_key_private__
1940 #
1941 # $session        A session key previously returned by get_session_key
1942 #
1943 # Releases and invalidates the session key. Worker function that does the work
1944 # of release_session_key.
1945 #                   ****DO NOT CALL DIRECTLY****
1946 # unless you want your session key released immediately, possibly preventing
1947 # asynchronous tasks from completing
1948 #
1949 #----------------------------------------------------------------------------
1950 sub release_session_key_private__
1951 {
1952     my ( $self, $session ) = @_;
1953
1954     if ( defined( $self->{api_sessions__}{$session} ) ) {
1955         $self->log_( 1, "release_session_key releasing key $session for user $self->{api_sessions__}{$session}" );
1956         delete $self->{api_sessions__}{$session};
1957     }
1958 }
1959
1960 #----------------------------------------------------------------------------
1961 #
1962 # valid_session_key__
1963 #
1964 # $session                Session key returned by call to get_session_key
1965 #
1966 # Returns undef is the session key is not valid, or returns the user
1967 # ID associated with the session key which can be used in database
1968 # accesses
1969 #
1970 #----------------------------------------------------------------------------
1971 sub valid_session_key__
1972 {
1973     my ( $self, $session ) = @_;
1974
1975     # This provides protection against someone using the XML-RPC
1976     # interface and calling this API directly to fish for session
1977     # keys, this must be called from within this module
1978
1979     return undef if ( caller ne 'Classifier::Bayes' );
1980
1981     # If the session key is invalid then wait 1 second.  This is done
1982     # to prevent people from calling a POPFile API such as
1983     # get_bucket_count with random session keys fishing for a valid
1984     # key.  The XML-RPC API is single threaded and hence this will
1985     # delay all use of that API by one second.  Of course in normal
1986     # use when the user knows the username/password or session key
1987     # then there is no delay
1988
1989     if ( !defined( $self->{api_sessions__}{$session} ) ) {
1990         my ( $package, $filename, $line, $subroutine ) = caller;
1991         $self->log_( 0, "Invalid session key $session provided in $package @ $line" );
1992         select( undef, undef, undef, 1 );
1993     }
1994
1995     return $self->{api_sessions__}{$session};
1996 }
1997
1998 #----------------------------------------------------------------------------
1999 #----------------------------------------------------------------------------
2000 # _____   _____   _____  _______ _____        _______   _______  _____  _____
2001 #|_____] |     | |_____] |______   |   |      |______   |_____| |_____]   |
2002 #|       |_____| |       |       __|__ |_____ |______   |     | |       __|__
2003 #
2004 # The method below are public and may be accessed by other modules.
2005 # All of them may be accessed remotely through the XMLRPC.pm module
2006 # using the XML-RPC protocol
2007 #
2008 # Note that every API function expects to be passed a $session which
2009 # is obtained by first calling get_session_key with a valid username
2010 # and password.  Once done call the method release_session_key.
2011 #
2012 # See POPFile::API for more details
2013 #
2014 #----------------------------------------------------------------------------
2015 #----------------------------------------------------------------------------
2016
2017 #----------------------------------------------------------------------------
2018 #
2019 # get_session_key
2020 #
2021 # $user           The name of an existing user
2022 # $pwd            The user's password
2023 #
2024 # Returns a string based session key if the username and password
2025 # match, or undef if not
2026 #
2027 #----------------------------------------------------------------------------
2028 sub get_session_key
2029 {
2030     my ( $self, $user, $pwd ) = @_;
2031
2032     # The password is stored in the database as an MD5 hash of the
2033     # username and password concatenated and separated by the string
2034     # __popfile__, so compute the hash here
2035
2036     my $hash = md5_hex( $user . '__popfile__' . $pwd );
2037
2038     $self->validate_sql_prepare_and_execute( $self->{db_get_userid__}, $user, $hash );
2039     my $result = $self->{db_get_userid__}->fetchrow_arrayref;
2040     if ( !defined( $result ) ) {
2041
2042         # The delay of one second here is to prevent people from trying out
2043         # username/password combinations at high speed to determine the
2044         # credentials of a valid user
2045
2046         $self->log_( 0, "Attempt to login with incorrect credentials for user $user" );
2047         select( undef, undef, undef, 1 );
2048         return undef;
2049     }
2050
2051     my $session = $self->generate_unique_session_key__();
2052
2053     $self->{api_sessions__}{$session} = $result->[0];
2054
2055     $self->db_update_cache__( $session );
2056
2057     $self->log_( 1, "get_session_key returning key $session for user $self->{api_sessions__}{$session}" );
2058
2059     return $session;
2060 }
2061
2062 #----------------------------------------------------------------------------
2063 #
2064 # release_session_key
2065 #
2066 # $session        A session key previously returned by get_session_key
2067 #
2068 # Releases and invalidates the session key
2069 #
2070 #----------------------------------------------------------------------------
2071 sub release_session_key
2072 {
2073     my ( $self, $session ) = @_;
2074
2075     $self->mq_post_( "RELSE", $session );
2076 }
2077
2078
2079 #----------------------------------------------------------------------------
2080 #
2081 # get_top_bucket__
2082 #
2083 # Helper function used by classify to get the bucket with the highest
2084 # score from data stored in a matrix of information (see definition of
2085 # %matrix in classify for details) and a list of potential buckets
2086 #
2087 # $userid         User ID for database access
2088 # $id             ID of a word in $matrix
2089 # $matrix         Reference to the %matrix hash in classify
2090 # $buckets        Reference to a list of buckets
2091 #
2092 # Returns the bucket in $buckets with the highest score
2093 #
2094 #----------------------------------------------------------------------------
2095 sub get_top_bucket__
2096 {
2097     my ( $self, $userid, $id, $matrix, $buckets ) = @_;
2098
2099     my $best_probability = 0;
2100     my $top_bucket       = 'unclassified';
2101
2102     for my $bucket (@$buckets) {
2103         my $probability = 0;
2104         if ( defined($$matrix{$id}{$bucket}) && ( $$matrix{$id}{$bucket} > 0 ) ) {
2105             $probability = $$matrix{$id}{$bucket} / $self->{db_bucketcount__}{$userid}{$bucket};
2106         }
2107
2108         if ( $probability > $best_probability ) {
2109             $best_probability = $probability;
2110             $top_bucket       = $bucket;
2111         }
2112     }
2113
2114     return $top_bucket;
2115 }
2116
2117 #----------------------------------------------------------------------------
2118 #
2119 # classify
2120 #
2121 # $session   A valid session key returned by a call to get_session_key
2122 # $file The name of the file containing the text to classify (or undef
2123 # to use the data already in the parser)
2124 # $templ     Reference to the UI template used for word score display
2125 # $matrix (optional) Reference to a hash that will be filled with the
2126 # word matrix used in classification
2127 # $idmap (optional) Reference to a hash that will map word ids in the
2128 # $matrix to actual words
2129 #
2130 # Splits the mail message into valid words, then runs the Bayes
2131 # algorithm to figure out which bucket it belongs in.  Returns the
2132 # bucket name
2133 #
2134 #----------------------------------------------------------------------------
2135 sub classify
2136 {
2137     my ( $self, $session, $file, $templ, $matrix, $idmap ) = @_;
2138     my $msg_total = 0;
2139
2140     my $userid = $self->valid_session_key__( $session );
2141     return undef if ( !defined( $userid ) );
2142
2143     $self->{unclassified__} = log( $self->config_( 'unclassified_weight' ) );
2144
2145     $self->{magnet_used__}   = 0;
2146     $self->{magnet_detail__} = 0;
2147
2148     if ( defined( $file ) ) {
2149         $self->{parser__}->parse_file( $file,                                           # PROFILE BLOCK START
2150                                        $self->global_config_( 'message_cutoff'   ) );   # PROFILE BLOCK STOP
2151     }
2152
2153     # Get the list of buckets
2154
2155     my @buckets = $self->get_buckets( $session );
2156
2157     # If the user has not defined any buckets then we escape here
2158     # return unclassified
2159
2160     return "unclassified" if ( $#buckets == -1 );
2161
2162     # Check to see if this email should be classified based on a magnet
2163
2164     for my $bucket ($self->get_buckets_with_magnets( $session ))  {
2165         for my $type ($self->get_magnet_types_in_bucket( $session, $bucket )) {
2166             if ( $self->magnet_match__( $session, $self->{parser__}->get_header($type), $bucket, $type ) ) {
2167                 return $bucket;
2168             }
2169         }
2170     }
2171
2172     # The score hash will contain the likelihood that the given
2173     # message is in each bucket, the buckets are the keys for score
2174
2175     # Set up the initial score as P(bucket)
2176
2177     my %score;
2178     my %matchcount;
2179
2180     # Build up a list of the buckets that are OK to use for
2181     # classification (i.e.  that have at least one word in them).
2182
2183     my @ok_buckets;
2184
2185     for my $bucket (@buckets) {
2186         if ( $self->{bucket_start__}{$userid}{$bucket} != 0 ) {
2187             $score{$bucket} = $self->{bucket_start__}{$userid}{$bucket};
2188             $matchcount{$bucket} = 0;
2189             push @ok_buckets, ( $bucket );
2190         }
2191     }
2192
2193     @buckets = @ok_buckets;
2194
2195     # If the user does not have at least two buckets which contains
2196     # some words then we escape here return unclassified
2197
2198     return "unclassified" if ( $#buckets < 1 );
2199
2200     # For each word go through the buckets and calculate
2201     # P(word|bucket) and then calculate P(word|bucket) ^ word count
2202     # and multiply to the score
2203
2204     my $word_count = 0;
2205
2206     # The correction value is used to generate score displays variable
2207     # which are consistent with the word scores shown by the GUI's
2208     # word lookup feature.  It is computed to make the contribution of
2209     # a word which is unrepresented in a bucket zero.  This correction
2210     # affects only the values displayed in the display; it has no
2211     # effect on the classification process.
2212
2213     my $correction = 0;
2214
2215     # Classification against the database works in a sequence of steps
2216     # to get the fastest time possible.  The steps are as follows:
2217     #
2218     # 1. Convert the list of words returned by the parser into a list
2219     #    of unique word ids that can be used in the database.  This
2220     #    requires a select against the database to get the word ids
2221     #    (and associated words) which is then converted into two
2222     #    things: @id_list which is just the sorted list of word ids
2223     #    and %idmap which maps a word to its id.
2224     #
2225     # 2. Then run a second select that get the triplet (count, id,
2226     #    bucket) for each word id and each bucket.  The triplet
2227     #    contains the word count from the database for each bucket and
2228     #    each id, where there is an entry. That data gets loaded into
2229     #    the sparse matrix %matrix.
2230     #
2231     # 3. Do the normal classification loop as before running against
2232     # the @id_list for the words and for each bucket.  If there's an
2233     # entry in %matrix for the id/bucket combination then calculate
2234     # the probability, otherwise use the not_likely probability.
2235     #
2236     # NOTE.  Since there is a single not_likely probability we do not
2237     # worry about the fact that the select in 1 might return a shorter
2238     # list of words than was found in the message (because some words
2239     # are not in the database) since the missing words will be the
2240     # same for all buckets and hence constitute a fixed scaling factor
2241     # on all the buckets which is irrelevant in deciding which the
2242     # winning bucket is.
2243
2244     my $words;
2245     $words = join( ',', map( $self->{db__}->quote( $_ ), (sort keys %{$self->{parser__}{words__}}) ) );
2246     $self->{get_wordids__} = $self->validate_sql_prepare_and_execute(  # PROFILE BLOCK START
2247              "select id, word
2248                   from words
2249                   where word in ( $words )
2250                   order by id;" );                    # PROFILE BLOCK STOP
2251
2252     my @id_list;
2253     my %temp_idmap;
2254
2255     if ( !defined( $idmap ) ) {
2256         $idmap = \%temp_idmap;
2257     }
2258
2259     while ( my $row = $self->{get_wordids__}->fetchrow_arrayref ) {
2260         push @id_list, ($row->[0]);
2261         $$idmap{$row->[0]} = $row->[1];
2262     }
2263
2264     $self->{get_wordids__}->finish;
2265     undef $self->{get_wordids__};
2266
2267     my $ids = join( ',', @id_list );
2268
2269     $self->{db_classify__} = $self->validate_sql_prepare_and_execute(            # PROFILE BLOCK START
2270              "select matrix.times, matrix.wordid, buckets.name
2271                   from matrix, buckets
2272                   where matrix.wordid in ( $ids )
2273                     and matrix.bucketid = buckets.id
2274                     and buckets.userid = $userid;" );           # PROFILE BLOCK STOP
2275
2276     # %matrix maps wordids and bucket names to counts
2277     # $matrix{$wordid}{$bucket} == $count
2278
2279     my %temp_matrix;
2280
2281     if ( !defined( $matrix ) ) {
2282         $matrix = \%temp_matrix;
2283     }
2284
2285     while ( my $row = $self->{db_classify__}->fetchrow_arrayref ) {
2286         $$matrix{$row->[1]}{$row->[2]} = $row->[0];
2287     }
2288
2289     $self->{db_classify__}->finish;
2290     undef $self->{db_classify__};
2291
2292     foreach my $id (@id_list) {
2293         $word_count += 2;
2294         my $wmax = -10000;
2295
2296         foreach my $bucket (@buckets) {
2297             my $probability = 0;
2298
2299             if ( defined($$matrix{$id}{$bucket}) && ( $$matrix{$id}{$bucket} > 0 ) ) {
2300                 $probability = log( $$matrix{$id}{$bucket} / $self->{db_bucketcount__}{$userid}{$bucket} );
2301             }
2302
2303             $matchcount{$bucket} += $self->{parser__}{words__}{$$idmap{$id}} if ($probability != 0);
2304             $probability = $self->{not_likely__}{$userid} if ( $probability == 0 );
2305             $wmax = $probability if ( $wmax < $probability );
2306             $score{$bucket} += ( $probability * $self->{parser__}{words__}{$$idmap{$id}} );
2307         }
2308
2309         if ($wmax > $self->{not_likely__}{$userid}) {
2310             $correction += $self->{not_likely__}{$userid} * $self->{parser__}{words__}{$$idmap{$id}};
2311         } else {
2312             $correction += $wmax * $self->{parser__}{words__}{$$idmap{$id}};
2313         }
2314     }
2315
2316     # Now sort the scores to find the highest and return that bucket
2317     # as the classification
2318
2319     my @ranking = sort {$score{$b} <=> $score{$a}} keys %score;
2320
2321     my %raw_score;
2322     my $base_score = $score{$ranking[0]};
2323     my $total = 0;
2324
2325     # If the first and second bucket are too close in their
2326     # probabilities, call the message unclassified.  Also if there are
2327     # fewer than 2 buckets.
2328
2329     my $class = 'unclassified';
2330
2331     if ( @buckets > 1 && $score{$ranking[0]} > ( $score{$ranking[1]} + $self->{unclassified__} ) ) {
2332         $class = $ranking[0];
2333     }
2334
2335     # Compute the total of all the scores to generate the normalized
2336     # scores and probability estimate.  $total is always 1 after the
2337     # first loop iteration, so any additional term less than 2 ** -54
2338     # is insignificant, and need not be computed.
2339
2340     my $ln2p_54 = -54 * log(2);
2341
2342     foreach my $b (@ranking) {
2343         $raw_score{$b} = $score{$b};
2344         $score{$b} -= $base_score;
2345
2346         $total += exp($score{$b}) if ($score{$b} > $ln2p_54 );
2347     }
2348
2349     if ($self->{wordscores__} && defined($templ) ) {
2350         my %qm = %{$self->{parser__}->quickmagnets()};
2351         my $mlen = scalar(keys %{$self->{parser__}->quickmagnets()});
2352
2353         if ( $mlen > 0 ) {
2354             $templ->param( 'View_QuickMagnets_If' => 1 );
2355             $templ->param( 'View_QuickMagnets_Count' => ($mlen + 1) );
2356             my @buckets = $self->get_buckets( $session );
2357             my $i = 0;
2358             my %types = $self->get_magnet_types( $session );
2359
2360             my @bucket_data;
2361             foreach my $bucket (@buckets) {
2362                 my %row_data;
2363                 $row_data{View_QuickMagnets_Bucket} = $bucket;
2364                 $row_data{View_QuickMagnets_Bucket_Color} = $self->get_bucket_color( $session, $bucket );
2365                 push ( @bucket_data, \%row_data );
2366             }
2367
2368             my @qm_data;
2369             foreach my $type (sort keys %types) {