Ticket #92: Bayes.pm

File Bayes.pm, 139.7 kB (added by amatubu, 3 years ago)

Replacement of Bayes.pm

Line 
1 # POPFILE LOADABLE MODULE
2 package Classifier::Bayes;
3
4 use POPFile::Module;
5 @ISA = ("POPFile::Module");
6
7 #----------------------------------------------------------------------------
8 #
9 # Bayes.pm --- Naive Bayes text classifier
10 #
11 # Copyright (c) 2001-2008 John Graham-Cumming
12 #
13 #   This file is part of POPFile
14 #
15 #   POPFile is free software; you can redistribute it and/or modify it
16 #   under the terms of version 2 of the GNU General Public License as
17 #   published by the Free Software Foundation.
18 #
19 #   POPFile is distributed in the hope that it will be useful,
20 #   but WITHOUT ANY WARRANTY; without even the implied warranty of
21 #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22 #   GNU General Public License for more details.
23 #
24 #   You should have received a copy of the GNU General Public License
25 #   along with POPFile; if not, write to the Free Software
26 #   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27 #
28 #   Modified by              Sam Schinke    (sschinke@users.sourceforge.net)
29 #   Merged with db code from Scott Leighton (helphand@users.sourceforge.net)
30 #
31 #----------------------------------------------------------------------------
32
33 use strict;
34 use warnings;
35 use locale;
36 use Classifier::MailParse;
37 use IO::Handle;
38 use DBI;
39 use Digest::MD5 qw( md5_hex );
40 use MIME::Base64;
41 use File::Copy;
42
43 # This is used to get the hostname of the current machine
44 # in a cross platform way
45
46 use Sys::Hostname;
47
48 # A handy variable containing the value of an EOL for networks
49
50 my $eol = "\015\012";
51
52 # Korean characters definition
53
54 my $ksc5601_sym = '(?:[\xA1-\xAC][\xA1-\xFE])';
55 my $ksc5601_han = '(?:[\xB0-\xC8][\xA1-\xFE])';
56 my $ksc5601_hanja  = '(?:[\xCA-\xFD][\xA1-\xFE])';
57 my $ksc5601 = "(?:$ksc5601_sym|$ksc5601_han|$ksc5601_hanja)";
58
59 my $eksc = "(?:$ksc5601|[\x81-\xC6][\x41-\xFE])"; #extended ksc
60
61 #----------------------------------------------------------------------------
62 # new
63 #
64 #   Class new() function
65 #----------------------------------------------------------------------------
66 sub new
67 {
68     my $type = shift;
69     my $self = POPFile::Module->new();
70
71     # Set this to 1 to get scores for individual words in message detail
72
73     $self->{wordscores__}        = 0;
74
75     # Choice for the format of the "word matrix" display.
76
77     $self->{wmformat__}          = '';
78
79     # Just our hostname
80
81     $self->{hostname__}        = '';
82
83     # File Handle for DBI database
84
85     $self->{db__}                = {};
86
87     $self->{history__}        = 0;
88
89     # To save time we also 'prepare' some commonly used SQL statements
90     # and cache them here, see the function db_connect__ for details
91
92     $self->{db_get_buckets__} = 0;
93     $self->{db_get_wordid__} = 0;
94     $self->{db_get_word_count__} = 0;
95     $self->{db_put_word_count__} = 0;
96     $self->{db_get_bucket_unique_counts__} = 0;
97     $self->{db_get_unique_word_count__} = 0;
98     $self->{db_get_bucket_word_counts__} = 0;
99     $self->{db_get_full_total__} = 0;
100     $self->{db_get_bucket_parameter__} = 0;
101     $self->{db_set_bucket_parameter__} = 0;
102     $self->{db_get_bucket_parameter_default__} = 0;
103     $self->{db_get_buckets_with_magnets__} = 0;
104     $self->{db_delete_zero_words__} = 0;
105
106     # Caches the name of each bucket and relates it to both the bucket
107     # ID in the database and whether it is pseudo or not
108     #
109     # Subkeys used are:
110     #
111     # id     The bucket ID in the database
112     # pseudo 1 if this is a pseudo bucket
113
114     $self->{db_bucketid__}       = {};
115
116     # Caches the IDs that map to parameter types
117
118     $self->{db_parameterid__}    = {};
119
120     # Caches looked up parameter values on a per bucket basis
121
122     $self->{db_parameters__}     = {};
123
124     # Used to parse mail messages
125     $self->{parser__}            = new Classifier::MailParse;
126
127     # The possible colors for buckets
128     $self->{possible_colors__} = [ 'red',       'green',      'blue',       'brown', # PROFILE BLOCK START
129                                    'orange',    'purple',     'magenta',    'gray',
130                                    'plum',      'silver',     'pink',       'lightgreen',
131                                    'lightblue', 'lightcyan',  'lightcoral', 'lightsalmon',
132                                    'lightgrey', 'darkorange', 'darkcyan',   'feldspar',
133                                    'black' ];                                        # PROFILE BLOCK STOP
134
135     # Precomputed per bucket probabilities
136     $self->{bucket_start__}      = {};
137
138     # A very unlikely word
139     $self->{not_likely__}        = {};
140
141     # The expected corpus version
142     #
143     # DEPRECATED  This is only used when upgrading old flat file corpus files
144     #             to the database
145     $self->{corpus_version__}    = 1;
146
147     # The unclassified cutoff this value means that the top
148     # probabilily must be n times greater than the second probability,
149     # default is 100 times more likely
150     $self->{unclassified__}      = log(100);
151
152     # Used to tell the caller whether a magnet was used in the last
153     # mail classification
154     $self->{magnet_used__}       = 0;
155     $self->{magnet_detail__}     = 0;
156
157     # This maps session keys (long strings) to user ids.  If there's
158     # an entry here then the session key is valid and can be used in
159     # the POPFile API.  See the methods get_session_key and
160     # release_session_key for details
161
162     $self->{api_sessions__}      = {};
163
164     # Used to indicate whether we are using SQLite and what the full
165     # path and name of the database is if we are.
166
167     $self->{db_is_sqlite__}      = 0;
168     $self->{db_name__}           = '';
169
170     # Must call bless before attempting to call any methods
171
172     bless $self, $type;
173
174     $self->name( 'bayes' );
175
176     return $self;
177 }
178
179 #----------------------------------------------------------------------------
180 #
181 # forked
182 #
183 # This is called inside a child process that has just forked, since
184 # the child needs access to the database we open it
185 #
186 #----------------------------------------------------------------------------
187 sub forked
188 {
189     my ( $self ) = @_;
190
191     $self->db_connect__();
192 }
193
194 #----------------------------------------------------------------------------
195 #
196 # childexit
197 #
198 # This is called inside a child process that is about to finish, since
199 # the child does not need access to the database we close it
200 #
201 #----------------------------------------------------------------------------
202 sub childexit
203 {
204     my ( $self ) = @_;
205
206     $self->db_disconnect__();
207 }
208
209 #----------------------------------------------------------------------------
210 #
211 # initialize
212 #
213 # Called to set up the Bayes module's parameters
214 #
215 #----------------------------------------------------------------------------
216 sub initialize
217 {
218     my ( $self ) = @_;
219
220     # This is the name for the database
221
222     $self->config_( 'database', 'popfile.db' );
223
224     # This is the 'connect' string used by DBI to connect to the
225     # database, if you decide to change from using SQLite to some
226     # other database (e.g. MySQL, Oracle, ... ) this *should* be all
227     # you need to change.  The additional parameters user and auth are
228     # needed for some databases.
229     #
230     # Note that the dbconnect string
231     # will be interpolated before being passed to DBI and the variable
232     # $dbname can be used within it and it resolves to the full path
233     # to the database named in the database parameter above.
234
235     $self->config_( 'dbconnect', 'dbi:SQLite:dbname=$dbname' );
236     $self->config_( 'dbuser', '' ); $self->config_( 'dbauth', '' );
237
238     # SQLite 1.05+ had some problems we've resolved.
239     # This parameter is no longer used but we leave it for future use
240
241     $self->config_( 'bad_sqlite_version', '4.0.0' );
242
243     # No default unclassified weight is the number of times more sure
244     # POPFile must be of the top class vs the second class, default is
245     # 100 times more
246
247     $self->config_( 'unclassified_weight', 100 );
248
249     # The corpus is kept in the 'corpus' subfolder of POPFile
250     #
251     # DEPRECATED This is only used to find an old corpus that might
252     # need to be upgraded
253
254     $self->config_( 'corpus', 'corpus' );
255
256     # The characters that appear before and after a subject
257     # modification
258
259     $self->config_( 'subject_mod_left',  '[' );
260     $self->config_( 'subject_mod_right', ']' );
261
262     # Get the hostname for use in the X-POPFile-Link header
263
264     $self->{hostname__} = hostname;
265
266     # Allow the user to override the hostname
267
268     $self->config_( 'hostname', $self->{hostname__} );
269
270     # If set to 1 then the X-POPFile-Link will have < > around the URL
271     # (i.e. X-POPFile-Link: <http://foo.bar>) when set to 0 there are
272     # none (i.e. X-POPFile-Link: http://foo.bar)
273
274     $self->config_( 'xpl_angle', 0 );
275
276     # This parameter is used when the UI is operating in Stealth Mode.
277     # If left blank (the default setting) the X-POPFile-Link will use 127.0.0.1
278     # otherwise it will use this string instead. The system's HOSTS file should
279     # map the string to 127.0.0.1
280
281     $self->config_( 'localhostname', '' );
282
283     # This is a bit mask used to control options when we are using the
284     # default SQLite database.  By default all the options are on.
285     #
286     # 1 = Asynchronous deletes
287     # 2 = Backup database every hour
288
289     $self->config_( 'sqlite_tweaks', 0xFFFFFFFF );
290
291     # Japanese wakachigaki parser ('kakasi' or 'mecab' or 'internal').
292     $self->config_( 'nihongo_parser', 'kakasi' );
293
294     $self->mq_register_( 'COMIT', $self );
295     $self->mq_register_( 'RELSE', $self );
296
297     # Register for the TICKD message which is sent hourly by the
298     # Logger module.  We use this to hourly save the database if bit 1
299     # of the sqlite_tweaks is set and we are using SQLite
300
301     $self->mq_register_( 'TICKD', $self );
302
303     return 1;
304 }
305
306 #----------------------------------------------------------------------------
307 #
308 # deliver
309 #
310 # Called by the message queue to deliver a message
311 #
312 # There is no return value from this method
313 #
314 #----------------------------------------------------------------------------
315 sub deliver
316 {
317     my ( $self, $type, @message ) = @_;
318
319     if ( $type eq 'COMIT' ) {
320         $self->classified( $message[0], $message[2] );
321     }
322
323     if ( $type eq 'RELSE' ) {
324         $self->release_session_key_private__( $message[0] );
325     }
326
327     if ( $type eq 'TICKD' ) {
328         $self->backup_database__();
329     }
330 }
331
332 #----------------------------------------------------------------------------
333 #
334 # start
335 #
336 # Called to start the Bayes module running
337 #
338 #----------------------------------------------------------------------------
339 sub start
340 {
341     my ( $self ) = @_;
342
343     # In Japanese or Korean or Chinese mode, explicitly set LC_COLLATE to C.
344     #
345     # This is to avoid Perl crash on Windows because default
346     # LC_COLLATE of Japanese Win is Japanese_Japan.932(Shift_JIS),
347     # which is different from the charset POPFile uses for Japanese
348     # characters(EUC-JP).
349
350     my $lang = $self->module_config_( 'html', 'language' ) || '';
351
352     if ( $lang =~ /^(Nihongo$|Korean$|Chinese)/ ) {
353         use POSIX qw( locale_h );
354         setlocale( LC_COLLATE, 'C' );
355     }
356
357     # Pass in the current interface language for language specific parsing
358
359     $self->{parser__}->{lang__}  = $lang;
360     $self->{unclassified__} = log( $self->config_( 'unclassified_weight' ) );
361
362     if ( !$self->db_connect__() ) {
363         return 0;
364     }
365
366     if ( $lang eq 'Nihongo' ) {
367         # Setup Nihongo (Japanese) parser.
368
369         my $nihongo_parser = $self->config_( 'nihongo_parser' );
370
371         $nihongo_parser = $self->{parser__}->setup_nihongo_parser( $nihongo_parser );
372
373         $self->log_( 2, "Use Nihongo (Japanese) parser : $nihongo_parser" );
374         $self->config_( 'nihongo_parser', $nihongo_parser );
375
376         # Since Text::Kakasi is not thread-safe, we use it under the
377         # control of a Mutex to avoid a crash if we are running on
378         # Windows and using the fork.
379
380         if ( ( $nihongo_parser eq 'kakasi' ) && ( $^O eq 'MSWin32' ) &&
381              ( ( ( $self->module_config_( 'pop3', 'enabled' ) ) &&
382                  ( $self->module_config_( 'pop3', 'force_fork' ) ) ) ||
383                ( ( $self->module_config_( 'nntp', 'enabled' ) ) &&
384                  ( $self->module_config_( 'nntp', 'force_fork' ) ) ) ||
385                ( ( $self->module_config_( 'smtp', 'enabled' ) ) &&
386                  ( $self->module_config_( 'smtp', 'force_fork' ) ) ) ) ) {
387             $self->{parser__}->{need_kakasi_mutex__} = 1;
388
389             # Prepare the Mutex.
390             require POPFile::Mutex;
391             $self->{parser__}->{kakasi_mutex__} = new POPFile::Mutex( 'mailparse_kakasi' );
392             $self->log_( 2, "Create mutex for Kakasi." );
393         }
394     }
395
396     $self->upgrade_predatabase_data__();
397
398     return 1;
399 }
400
401 #----------------------------------------------------------------------------
402 #
403 # stop
404 #
405 # Called when POPFile is terminating
406 #
407 #----------------------------------------------------------------------------
408 sub stop
409 {
410     my ( $self ) = @_;
411
412     $self->db_disconnect__();
413     delete $self->{parser__};
414 }
415
416 #----------------------------------------------------------------------------
417 #
418 # classified
419 #
420 # Called to inform the module about a classification event
421 #
422 # There is no return value from this method
423 #
424 #----------------------------------------------------------------------------
425 sub classified
426 {
427     my ( $self, $session, $class ) = @_;
428
429     $self->set_bucket_parameter( $session, $class, 'count',             # PROFILE BLOCK START
430         $self->get_bucket_parameter( $session, $class, 'count' ) + 1 ); # PROFILE BLOCK STOP
431 }
432
433 #----------------------------------------------------------------------------
434 #
435 # backup_database__
436 #
437 # Called when the TICKD message is received each hour and if we are using
438 # the default SQLite database will make a copy with the .backup extension
439 #
440 #----------------------------------------------------------------------------
441 sub backup_database__
442 {
443     my ( $self ) = @_;
444
445     # If database backup is turned on and we are using SQLite then
446     # backup the database by copying it
447
448     if ( ( $self->config_( 'sqlite_tweaks' ) & 2 ) &&
449          $self->{db_is_sqlite__} ) {
450         if ( !copy( $self->{db_name__}, $self->{db_name__} . ".backup" ) ) {
451         $self->log_( 0, "Failed to backup database ".$self->{db_name__} );
452         }
453     }
454 }
455
456 #----------------------------------------------------------------------------
457 #
458 # tweak_sqlite
459 #
460 # Called when a module wants is to tweak access to the SQLite database.
461 #
462 # $tweak    The tweak to apply (a bit in the sqlite_tweaks mask)
463 # $state    1 to enable the tweak, 0 to disable
464 # $db       The db handle to tweak
465 #
466 #----------------------------------------------------------------------------
467 sub tweak_sqlite
468 {
469     my ( $self, $tweak, $state, $db ) = @_;
470
471     if ( $self->{db_is_sqlite__} &&
472          ( $self->config_( 'sqlite_tweaks' ) & $tweak ) ) {
473
474         $self->log_( 1, "Performing tweak $tweak to $state" );
475
476         if ( $tweak == 1 ) {
477             my $sync = $state?'off':'normal';
478             $db->do( "pragma synchronous=$sync;" );
479         }
480     }
481 }
482
483 #----------------------------------------------------------------------------
484 #
485 # reclassified
486 #
487 # Called to inform the module about a reclassification from one bucket
488 # to another
489 #
490 # session            Valid API session
491 # bucket             The old bucket name
492 # newbucket          The new bucket name
493 # undo               1 if this is an undo operation
494 #
495 # There is no return value from this method
496 #
497 #----------------------------------------------------------------------------
498 sub reclassified
499 {
500     my ( $self, $session, $bucket, $newbucket, $undo ) = @_;
501
502     $self->log_( 0, "Reclassification from $bucket to $newbucket" );
503
504     my $c = $undo?-1:1;
505
506     if ( $bucket ne $newbucket ) {
507         my $count = $self->get_bucket_parameter(
508                         $session, $newbucket, 'count' );
509         my $newcount = $count + $c;
510         $newcount = 0 if ( $newcount < 0 );
511         $self->set_bucket_parameter(
512             $session, $newbucket, 'count', $newcount );
513
514         $count = $self->get_bucket_parameter(
515                      $session, $bucket, 'count' );
516         $newcount = $count - $c;
517         $newcount = 0 if ( $newcount < 0 );
518         $self->set_bucket_parameter(
519             $session, $bucket, 'count', $newcount );
520
521         my $fncount = $self->get_bucket_parameter(
522                           $session, $newbucket, 'fncount' );
523         my $newfncount = $fncount + $c;
524         $newfncount = 0 if ( $newfncount < 0 );
525         $self->set_bucket_parameter(
526             $session, $newbucket, 'fncount', $newfncount );
527
528         my $fpcount = $self->get_bucket_parameter(
529                           $session, $bucket, 'fpcount' );
530         my $newfpcount = $fpcount + $c;
531         $newfpcount = 0 if ( $newfpcount < 0 );
532         $self->set_bucket_parameter(
533             $session, $bucket, 'fpcount', $newfpcount );
534     }
535 }
536
537 #----------------------------------------------------------------------------
538 #
539 # get_color
540 #
541 # Retrieves the color for a specific word, color is the most likely bucket
542 #
543 # $session  Session key returned by get_session_key
544 # $word     Word to get the color of
545 #
546 #----------------------------------------------------------------------------
547 sub get_color
548 {
549     my ( $self, $session, $word ) = @_;
550
551     my $max   = -10000;
552     my $color = 'black';
553
554     for my $bucket ($self->get_buckets( $session )) {
555         my $prob = $self->get_value_( $session, $bucket, $word );
556
557         if ( $prob != 0 )  {
558             if ( $prob > $max )  {
559                 $max   = $prob;
560                 $color = $self->get_bucket_parameter( $session, $bucket,
561                              'color' );
562             }
563         }
564     }
565
566     return $color;
567 }
568
569 #----------------------------------------------------------------------------
570 #
571 # get_not_likely_
572 #
573 # Returns the probability of a word that doesn't appear
574 #
575 #----------------------------------------------------------------------------
576 sub get_not_likely_
577 {
578     my ( $self, $session ) = @_;
579
580     my $userid = $self->valid_session_key__( $session );
581     return undef if ( !defined( $userid ) );
582
583     return $self->{not_likely__}{$userid};
584 }
585
586 #----------------------------------------------------------------------------
587 #
588 # get_value_
589 #
590 # Returns the value for a specific word in a bucket.  The word is
591 # converted to the log value of the probability before return to get
592 # the raw value just hit the hash directly or call get_base_value_
593 #
594 #----------------------------------------------------------------------------
595 sub get_value_
596 {
597     my ( $self, $session, $bucket, $word ) = @_;
598
599     my $value = $self->db_get_word_count__( $session, $bucket, $word );
600
601     if ( defined( $value ) && ( $value > 0 ) ) {
602
603         # Profiling notes:
604         #
605         # I tried caching the log of the total value and then doing
606         # log( $value ) - $cached and this turned out to be
607         # much slower than this single log with a division in it
608
609         return log( $value /
610                     $self->get_bucket_word_count( $session, $bucket ) );
611     } else {
612         return 0;
613     }
614 }
615
616 sub get_base_value_
617 {
618     my ( $self, $session, $bucket, $word ) = @_;
619
620     my $value = $self->db_get_word_count__( $session, $bucket, $word );
621
622     if ( defined( $value ) ) {
623         return $value;
624     } else {
625         return 0;
626     }
627 }
628
629 #----------------------------------------------------------------------------
630 #
631 # set_value_
632 #
633 # Sets the value for a word in a bucket and updates the total word
634 # counts for the bucket and globally
635 #
636 #----------------------------------------------------------------------------
637 sub set_value_
638 {
639     my ( $self, $session, $bucket, $word, $value ) = @_;
640
641     if ( $self->db_put_word_count__( $session, $bucket,
642              $word, $value ) == 1 ) {
643
644         # If we set the word count to zero then clean it up by deleting the
645         # entry
646
647         my $userid = $self->valid_session_key__( $session );
648         my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
649         $self->validate_sql_prepare_and_execute( $self->{db_delete_zero_words__}, $bucketid );
650
651         return 1;
652     } else {
653         return 0;
654     }
655 }
656
657 #----------------------------------------------------------------------------
658 #
659 # get_sort_value_ behaves the same as get_value_, except that it
660 # returns not_likely__ rather than 0 if the word is not found.  This
661 # makes its result more suitable as a sort key for bucket ranking.
662 #
663 #----------------------------------------------------------------------------
664 sub get_sort_value_
665 {
666     my ( $self, $session, $bucket, $word ) = @_;
667
668     my $v = $self->get_value_( $session, $bucket, $word );
669
670     if ( $v == 0 ) {
671
672         my $userid = $self->valid_session_key__( $session );
673         return undef if ( !defined( $userid ) );
674
675         return $self->{not_likely__}{$userid};
676     } else {
677         return $v;
678     }
679 }
680
681 #----------------------------------------------------------------------------
682 #
683 # update_constants__
684 #
685 # Updates not_likely and bucket_start
686 #
687 #----------------------------------------------------------------------------
688 sub update_constants__
689 {
690     my ( $self, $session ) = @_;
691
692     my $wc = $self->get_word_count( $session );
693
694     my $userid = $self->valid_session_key__( $session );
695     return undef if ( !defined( $userid ) );
696
697     if ( defined( $wc ) && $wc > 0 )  {
698         $self->{not_likely__}{$userid} = -log( 10 * $wc );
699
700         foreach my $bucket ($self->get_buckets( $session )) {
701             my $total = $self->get_bucket_word_count( $session, $bucket );
702
703             if ( $total != 0 ) {
704                 $self->{bucket_start__}{$userid}{$bucket} = log( $total /
705                                                                  $wc );
706             } else {
707                 $self->{bucket_start__}{$userid}{$bucket} = 0;
708             }
709         }
710     } else {
711         $self->{not_likely__}{$userid} = 0;
712     }
713 }
714
715 #----------------------------------------------------------------------------
716 #
717 # db_connect__
718 #
719 # Connects to the POPFile database and returns 1 if successful
720 #
721 #----------------------------------------------------------------------------
722 sub db_connect__
723 {
724     my ( $self ) = @_;
725
726     # Connect to the database, note that the database must exist for
727     # this to work, to make this easy for people POPFile we will
728     # create the database automatically here using the file
729     # 'popfile.sql' which should be located in the same directory the
730     # Classifier/Bayes.pm module
731
732     # If we are using SQLite then the dbname is actually the name of a
733     # file, and hence we treat it like one, otherwise we leave it
734     # alone
735
736     my $dbname;
737     my $dbconnect = $self->config_( 'dbconnect' );
738     my $dbpresent;
739     my $sqlite = ( $dbconnect =~ /sqlite/i );
740     my $mysql  = ( $dbconnect =~ /mysql/i );
741     my %connection_options = ();
742
743     if ( $sqlite ) {
744         $dbname = $self->get_user_path_( $self->config_( 'database' ) );
745         $dbpresent = ( -e $dbname ) || 0;
746     } else {
747         $dbname = $self->config_( 'database' );
748         $dbpresent = 1;
749
750         if ( $mysql ) {
751
752             # Turn on auto_reconnect
753
754             $connection_options{mysql_auto_reconnect} = 1;
755         }
756     }
757
758     # Record whether we are using SQLite or not and the name of the
759     # database so that other routines can access it; this is used by
760     # the backup_database__ routine to make a backup copy of the
761     # database when using SQLite.
762
763     $self->{db_is_sqlite__} = $sqlite;
764     $self->{db_name__}      = $dbname;
765
766     # Now perform the connect, note that this is database independent
767     # at this point, the actual database that we connect to is defined
768     # by the dbconnect parameter.
769
770     $dbconnect =~ s/\$dbname/$dbname/g;
771
772     $self->log_( 0, "Attempting to connect to $dbconnect ($dbpresent)" );
773
774     my $need_convert = 0;
775     my $old_dbh;
776
777     if ( $sqlite && $dbpresent ) {
778
779         # Check if the database is SQLite2 format
780
781         open DBFILE, $dbname;
782         my $buffer;
783         my $readed = sysread( DBFILE, $buffer, 47 );
784         close DBFILE;
785
786         if ( $buffer eq '** This file contains an SQLite 2.1 database **' ) {
787             $self->log_( 0, 'SQLite 2 database found. Try to upgrade' );
788
789             # Test DBD::SQLite version
790
791             my $ver = -1;
792             eval {
793                 require DBD::SQLite;
794                 $ver = $DBD::SQLite::VERSION;
795             };
796
797             if ( $ver >= 1.00 ) {
798                 $self->log_( 0, "DBD::SQLite $ver found" );
799
800                 # Backup SQLite2 database
801
802                 my $old_dbname = $dbname . '-sqlite2';
803                 unlink $old_dbname;
804                 rename $dbname, $old_dbname;
805
806                 # Connect to SQLite2 database
807
808                 my $old_dbconnect = $self->config_( 'dbconnect' );
809                 $old_dbconnect =~ s/SQLite:/SQLite2:/;
810                 $old_dbconnect =~ s/\$dbname/$old_dbname/g;
811
812                 $old_dbh = DBI->connect( $old_dbconnect,               # PROFILE BLOCK START
813                                          $self->config_( 'dbuser' ),
814                                          $self->config_( 'dbauth' ) ); # PROFILE BLOCK STOP
815
816                 # Update the config file
817
818                 $dbconnect = $self->config_( 'dbconnect' );
819                 $dbconnect =~ s/SQLite2:/SQLite:/;
820                 $self->config_( 'dbconnect', $dbconnect );
821                 $dbconnect =~ s/\$dbname/$dbname/g;
822
823                 $need_convert = 1;
824             }
825         } else {
826
827             # Update the config file
828
829             $dbconnect = $self->config_( 'dbconnect' );
830             $dbconnect =~ s/SQLite2:/SQLite:/;
831             $self->config_( 'dbconnect', $dbconnect );
832             $dbconnect =~ s/\$dbname/$dbname/g;
833         }
834     }
835
836     $self->{db__} = DBI->connect( $dbconnect,                 # PROFILE BLOCK START
837                                   $self->config_( 'dbuser' ),
838                                   $self->config_( 'dbauth' ),
839                                   \%connection_options );     # PROFILE BLOCK STOP
840
841     if ( !defined( $self->{db__} ) ) {
842         $self->log_( 0, "Failed to connect to database and got error $DBI::errstr" );
843         return 0;
844     }
845
846     if ( $sqlite ) {
847         $self->log_( 0, "Using SQLite library version " . $self->{db__}{sqlite_version} );
848
849         if ( $need_convert ) {
850             $self->log_( 0, 'Convert SQLite2 database to SQLite3 database' );
851
852             $self->db_upgrade__( $old_dbh );
853             $old_dbh->disconnect;
854
855             $self->log_( 0, 'Database convert completed' );
856         }
857
858         # For Japanese compatibility
859
860         if ( $self->{parser__}->{lang__} eq 'Nihongo' ) {
861             $self->{db__}->do( 'pragma case_sensitive_like=1;' );
862         }
863     }
864
865     if ( !$dbpresent ) {
866         if ( !$self->insert_schema__( $sqlite ) ) {
867             return 0;
868         }
869     }
870
871     # Now check for a need to upgrade the database because the schema
872     # has been changed.  From POPFile v0.22.0 there's a special
873     # 'popfile' table inside the database that contains the schema
874     # version number.  If the version number doesn't match or is
875     # missing then do the upgrade.
876
877     open SCHEMA, '<' . $self->get_root_path_( 'Classifier/popfile.sql' );
878     <SCHEMA> =~ /-- POPFILE SCHEMA (\d+)/;
879     my $version = $1;
880     close SCHEMA;
881
882     my $need_upgrade = 1;
883
884     #
885     # retrieve the SQL_IDENTIFIER_QUOTE_CHAR for the database then use it
886     # to strip off any sqlquotechars from the table names we retrieve
887     #
888
889     my $sqlquotechar = $self->{db__}->get_info(29) || '';
890     my @tables = map { s/$sqlquotechar//g; $_ } ($self->{db__}->tables());
891
892     foreach my $table (@tables) {
893         if ( $table =~ /\.?popfile$/ ) {
894             my @row = $self->{db__}->selectrow_array(
895                'select version from popfile;' );
896
897             if ( $#row == 0 ) {
898                 $need_upgrade = ( $row[0] != $version );
899             }
900         }
901     }
902
903     if ( $need_upgrade ) {
904
905         print "\n\nDatabase schema is outdated, performing automatic upgrade\n";
906
907         # The database needs upgrading
908
909         $self->db_upgrade__();
910
911         print "\nDatabase upgrade complete\n\n";
912     }
913
914     # Now prepare common SQL statements for use, as a matter of convention the
915     # parameters to each statement always appear in the following order:
916     #
917     # user
918     # bucket
919     # word
920     # parameter
921
922     $self->{db_get_buckets__} = $self->{db__}->prepare(                                 # PROFILE BLOCK START
923              'select name, id, pseudo from buckets
924                   where buckets.userid = ?;' );                                         # PROFILE BLOCK STOP
925
926     $self->{db_get_wordid__} = $self->{db__}->prepare(                                  # PROFILE BLOCK START
927              'select id from words
928                   where words.word = ? limit 1;' );                                     # PROFILE BLOCK STOP
929
930     $self->{db_get_userid__} = $self->{db__}->prepare(                                  # PROFILE BLOCK START
931              'select id from users where name = ?
932                                      and password = ? limit 1;' );                      # PROFILE BLOCK STOP
933
934     $self->{db_get_word_count__} = $self->{db__}->prepare(                              # PROFILE BLOCK START
935              'select matrix.times from matrix
936                   where matrix.bucketid = ? and
937                         matrix.wordid = ? limit 1;' );                                  # PROFILE BLOCK STOP
938
939     $self->{db_put_word_count__} = $self->{db__}->prepare(                              # PROFILE BLOCK START
940            'replace into matrix ( bucketid, wordid, times ) values ( ?, ?, ? );' );     # PROFILE BLOCK STOP
941
942     $self->{db_get_bucket_unique_counts__} = $self->{db__}->prepare(                    # PROFILE BLOCK START
943              'select count(matrix.wordid), buckets.name from matrix, buckets
944                   where buckets.userid = ?
945                     and matrix.bucketid = buckets.id
946                   group by buckets.name;' );                                            # PROFILE BLOCK STOP
947
948     $self->{db_get_bucket_word_counts__} = $self->{db__}->prepare(                      # PROFILE BLOCK START
949              'select sum(matrix.times), buckets.name from matrix, buckets
950                   where matrix.bucketid = buckets.id
951                     and buckets.userid = ?
952                     group by buckets.name;' );                                          # PROFILE BLOCK STOP
953
954     $self->{db_get_unique_word_count__} = $self->{db__}->prepare(                       # PROFILE BLOCK START
955              'select count(matrix.wordid) from matrix, buckets
956                   where matrix.bucketid = buckets.id and
957                         buckets.userid = ?;' );                                         # PROFILE BLOCK STOP
958
959     $self->{db_get_full_total__} = $self->{db__}->prepare(                              # PROFILE BLOCK START
960              'select sum(matrix.times) from matrix, buckets
961                   where buckets.userid = ? and
962                         matrix.bucketid = buckets.id;' );                               # PROFILE BLOCK STOP
963
964     $self->{db_get_bucket_parameter__} = $self->{db__}->prepare(                        # PROFILE BLOCK START
965              'select bucket_params.val from bucket_params
966                   where bucket_params.bucketid = ? and
967                         bucket_params.btid = ?;' );                                     # PROFILE BLOCK STOP
968
969     $self->{db_set_bucket_parameter__} = $self->{db__}->prepare(                        # PROFILE BLOCK START
970            'replace into bucket_params ( bucketid, btid, val ) values ( ?, ?, ? );' );  # PROFILE BLOCK STOP
971
972     $self->{db_get_bucket_parameter_default__} = $self->{db__}->prepare(                # PROFILE BLOCK START
973              'select bucket_template.def from bucket_template
974                   where bucket_template.id = ?;' );                                     # PROFILE BLOCK STOP
975
976     $self->{db_get_buckets_with_magnets__} = $self->{db__}->prepare(                    # PROFILE BLOCK START
977              'select buckets.name from buckets, magnets
978                   where buckets.userid = ? and
979                         magnets.id != 0 and
980                         magnets.bucketid = buckets.id group by buckets.name order by buckets.name;' );
981                                                                                         # PROFILE BLOCK STOP
982     $self->{db_delete_zero_words__} = $self->{db__}->prepare(                           # PROFILE BLOCK START
983              'delete from matrix
984                   where matrix.times = 0
985                     and matrix.bucketid = ?;' );                                        # PROFILE BLOCK STOP
986
987     # Get the mapping from parameter names to ids into a local hash
988
989     my $h = $self->validate_sql_prepare_and_execute( "select name, id from bucket_template;" );
990     while ( my $row = $h->fetchrow_arrayref ) {
991         $self->{db_parameterid__}{$row->[0]} = $row->[1];
992     }
993     $h->finish;
994
995     return 1;
996 }
997
998 #----------------------------------------------------------------------------
999 #
1000 # insert_schema__
1001 #
1002 # Insert the POPFile schema in a database
1003 #
1004 # $sqlite          Set to 1 if this is a SQLite database
1005 #
1006 #----------------------------------------------------------------------------
1007 sub insert_schema__
1008 {
1009     my ( $self, $sqlite ) = @_;
1010
1011     if ( -e $self->get_root_path_( 'Classifier/popfile.sql' ) ) {
1012         my $schema = '';
1013
1014         $self->log_( 0, "Creating database schema" );
1015
1016         open SCHEMA, '<' . $self->get_root_path_( 'Classifier/popfile.sql' );
1017         while ( <SCHEMA> ) {
1018             next if ( /^--/ );
1019             next if ( !/[a-z;]/ );
1020             s/--.*$//;
1021
1022             # If the line begins 'alter' and we are doing SQLite then ignore
1023             # the line
1024
1025             if ( $sqlite && ( /^alter/i ) ) {
1026                 next;
1027             }
1028
1029             $schema .= $_;
1030
1031             if ( ( /end;/ ) || ( /\);/ ) || ( /^alter/i ) ) {
1032                 $self->{db__}->do( $schema );
1033                 $schema = '';
1034             }
1035         }
1036         close SCHEMA;
1037         return 1;
1038     } else {
1039         $self->log_( 0, "Can't find the database schema" );
1040         return 0;
1041     }
1042 }
1043
1044 #----------------------------------------------------------------------------
1045 #
1046 # db_upgrade__
1047 #
1048 # Upgrade the POPFile schema / Convert the database
1049 #
1050 # $db_from         Database handle convert from
1051 #                  undef if upgrade POPFile schema
1052 #
1053 #----------------------------------------------------------------------------
1054 sub db_upgrade__
1055 {
1056     my ( $self, $db_from ) = @_;
1057
1058     my $drop_table;
1059
1060     if ( !defined( $db_from ) ) {
1061         # Upgrade
1062
1063         $drop_table = 1;
1064         $db_from = $self->{db__};
1065     }
1066
1067     my $from_sqlite = ( $db_from->{Driver}->{Name} =~ /SQLite/ );
1068     my $to_sqlite = ( $self->{db__}->{Driver}->{Name} =~ /SQLite/ );
1069
1070     my $sqlquotechar = $db_from->get_info(29) || '';
1071     my @tables = map { s/$sqlquotechar//g; $_ } ($db_from->tables());
1072
1073     # We are going to dump out all the data in the database as
1074     # INSERT OR IGNORE statements in a temporary file, then DROP all
1075     # the tables in the database, then recreate the schema from the
1076     # new schema and finally rerun the inserts.
1077
1078     my $i = 0;
1079     my $ins_file = $self->get_user_path_( 'insert.sql' );
1080     open INSERT, '>' . $ins_file;
1081
1082     foreach my $table (@tables) {
1083         next if ( $table =~ /\.?popfile$/ );
1084         if ( $from_sqlite && ( $table =~ /^sqlite_/ ) ) {
1085             next;
1086         }
1087         if ( $i > 99 ) {
1088             print "\n";
1089         }
1090         print "    Saving table $table\n    ";
1091
1092         my $t = $db_from->prepare( "select * from $table;" );
1093         $t->execute;
1094         $i = 0;
1095         while ( 1 ) {
1096             if ( ( ++$i % 100 ) == 0 ) {
1097                 print "[$i]";
1098                 flush STDOUT;
1099             }
1100             if ( ( $i % 1000 ) == 0 ) {
1101                 print "\n";
1102                 flush STDOUT;
1103             }
1104             my $rows = $t->fetchrow_arrayref;
1105
1106             last if ( !defined( $rows ) );
1107
1108             if ( $to_sqlite ) {
1109                 print INSERT "INSERT OR IGNORE INTO $table (";
1110             } else {
1111                 print INSERT "INSERT INTO $table (";
1112             }
1113             for my $i (0..$t->{NUM_OF_FIELDS}-1) {
1114                 if ( $i != 0 ) {
1115                     print INSERT ',';
1116                 }
1117                 print INSERT $t->{NAME}->[$i];
1118             }
1119             print INSERT ') VALUES (';
1120             for my $i (0..$t->{NUM_OF_FIELDS}-1) {
1121                 if ( $i != 0 ) {
1122                     print INSERT ',';
1123                 }
1124                 my $val = $rows->[$i];
1125                 if ( $t->{TYPE}->[$i] !~ /^int/i ) {
1126                     $val = '' if ( !defined( $val ) );
1127                     $val = $self->db_quote( $val );
1128                 } else {
1129                     $val = 'NULL' if ( !defined( $val ) );
1130                 }
1131                 print INSERT $val;
1132             }
1133             print INSERT ");\n";
1134         }
1135         $t->finish;
1136     }
1137
1138     close INSERT;
1139
1140     if ( $i > 99 ) {
1141         print "\n";
1142     }
1143
1144     if ( $drop_table ) {
1145         foreach my $table (@tables) {
1146             if ( $from_sqlite && ( $table =~ /^sqlite_/ ) ) {
1147                 next;
1148             }
1149             print "    Dropping old table $table\n";
1150             $self->{db__}->do( "DROP TABLE $table;" );
1151         }
1152     }
1153
1154     print "    Inserting new database schema\n";
1155     if ( !$self->insert_schema__( $to_sqlite ) ) {
1156         return 0;
1157     }
1158
1159     print "    Restoring old data\n    ";
1160
1161     $self->{db__}->begin_work;
1162     open INSERT, '<' . $ins_file;
1163     $i = 0;
1164     while ( <INSERT> ) {
1165         if ( ( ++$i % 100 ) == 0 ) {
1166            print "[$i]";
1167            flush STDOUT;
1168         }
1169         if ( ( $i % 1000 ) == 0 ) {
1170             print "\n";
1171             flush STDOUT;
1172         }
1173         s/[\r\n]//g;
1174         $self->{db__}->do( $_ );
1175     }
1176     close INSERT;
1177     $self->{db__}->commit;
1178
1179     unlink $ins_file;
1180 }
1181
1182 #----------------------------------------------------------------------------
1183 #
1184 # db_disconnect__
1185 #
1186 # Disconnect from the POPFile database
1187 #
1188 #----------------------------------------------------------------------------
1189 sub db_disconnect__
1190 {
1191     my ( $self ) = @_;
1192
1193     $self->{db_get_buckets__}->finish;
1194     $self->{db_get_wordid__}->finish;
1195     $self->{db_get_userid__}->finish;
1196     $self->{db_get_word_count__}->finish;
1197     $self->{db_put_word_count__}->finish;
1198     $self->{db_get_bucket_unique_counts__}->finish;
1199     $self->{db_get_bucket_word_counts__}->finish;
1200     $self->{db_get_unique_word_count__}->finish;
1201     $self->{db_get_full_total__}->finish;
1202     $self->{db_get_bucket_parameter__}->finish;
1203     $self->{db_set_bucket_parameter__}->finish;
1204     $self->{db_get_bucket_parameter_default__}->finish;
1205     $self->{db_get_buckets_with_magnets__}->finish;
1206     $self->{db_delete_zero_words__}->finish;
1207
1208     # Avoid DBD::SQLite 'closing dbh with active statement handles' bug
1209
1210     undef $self->{db_get_buckets__};
1211     undef $self->{db_get_wordid__};
1212     undef $self->{db_get_userid__};
1213     undef $self->{db_get_word_count__};
1214     undef $self->{db_put_word_count__};
1215     undef $self->{db_get_bucket_unique_counts__};
1216     undef $self->{db_get_bucket_word_counts__};
1217     undef $self->{db_get_unique_word_count__};
1218     undef $self->{db_get_full_total__};
1219     undef $self->{db_get_bucket_parameter__};
1220     undef $self->{db_set_bucket_parameter__};
1221     undef $self->{db_get_bucket_parameter_default__};
1222     undef $self->{db_get_buckets_with_magnets__};
1223     undef $self->{db_delete_zero_words__};
1224
1225     if ( defined( $self->{db__} ) ) {
1226         $self->{db__}->disconnect;
1227         undef $self->{db__};
1228     }
1229 }
1230
1231 #----------------------------------------------------------------------------
1232 #
1233 # db_update_cache__
1234 #
1235 # Updates our local cache of user and bucket ids.
1236 #
1237 # $session           Must be a valid session
1238 #
1239 #----------------------------------------------------------------------------
1240 sub db_update_cache__
1241 {
1242     my ( $self, $session ) = @_;
1243
1244     my $userid = $self->valid_session_key__( $session );
1245     return undef if ( !defined( $userid ) );
1246
1247     delete $self->{db_bucketid__}{$userid};
1248
1249     $self->validate_sql_prepare_and_execute( $self->{db_get_buckets__}, $userid );
1250     while ( my $row = $self->{db_get_buckets__}->fetchrow_arrayref ) {
1251         $self->{db_bucketid__}{$userid}{$row->[0]}{id} = $row->[1];
1252         $self->{db_bucketid__}{$userid}{$row->[0]}{pseudo} = $row->[2];
1253         $self->{db_bucketcount__}{$userid}{$row->[0]} = 0;
1254     }
1255
1256     $self->validate_sql_prepare_and_execute( $self->{db_get_bucket_word_counts__}, $userid );
1257
1258     for my $b (sort keys %{$self->{db_bucketid__}{$userid}}) {
1259         $self->{db_bucketcount__}{$userid}{$b} = 0;
1260         $self->{db_bucketunique__}{$userid}{$b} = 0;
1261     }
1262
1263     while ( my $row = $self->{db_get_bucket_word_counts__}->fetchrow_arrayref ) {
1264         $self->{db_bucketcount__}{$userid}{$row->[1]} = $row->[0];
1265     }
1266
1267     $self->validate_sql_prepare_and_execute( $self->{db_get_bucket_unique_counts__}, $userid );
1268
1269     while ( my $row = $self->{db_get_bucket_unique_counts__}->fetchrow_arrayref ) {
1270         $self->{db_bucketunique__}{$userid}{$row->[1]} = $row->[0];
1271     }
1272
1273     $self->update_constants__( $session );
1274 }
1275
1276 #----------------------------------------------------------------------------
1277 #
1278 # db_get_word_count__
1279 #
1280 # Return the 'count' value for a word in a bucket.  If the word is not
1281 # found in that bucket then returns undef.
1282 #
1283 # $session          Valid session ID from get_session_key
1284 # $bucket           bucket word is in
1285 # $word             word to lookup
1286 #
1287 #----------------------------------------------------------------------------
1288 sub db_get_word_count__
1289 {
1290     my ( $self, $session, $bucket, $word ) = @_;
1291
1292     my $userid = $self->valid_session_key__( $session );
1293     return undef if ( !defined( $userid ) );
1294
1295     $self->validate_sql_prepare_and_execute( $self->{db_get_wordid__}, $word );
1296     my $result = $self->{db_get_wordid__}->fetchrow_arrayref;
1297     if ( !defined( $result ) ) {
1298         return undef;
1299     }
1300
1301     my $wordid = $result->[0];
1302
1303     $self->validate_sql_prepare_and_execute( $self->{db_get_word_count__}, $self->{db_bucketid__}{$userid}{$bucket}{id}, $wordid );
1304     $result = $self->{db_get_word_count__}->fetchrow_arrayref;
1305     if ( defined( $result ) ) {
1306          return $result->[0];
1307     } else {
1308          return undef;
1309     }
1310 }
1311
1312 #----------------------------------------------------------------------------
1313 #
1314 # db_put_word_count__
1315 #
1316 # Update 'count' value for a word in a bucket, if the update fails
1317 # then returns 0 otherwise is returns 1
1318 #
1319 # $session          Valid session ID from get_session_key
1320 # $bucket           bucket word is in
1321 # $word             word to update
1322 # $count            new count value
1323 #
1324 #----------------------------------------------------------------------------
1325 sub db_put_word_count__
1326 {
1327     my ( $self, $session, $bucket, $word, $count ) = @_;
1328
1329     my $userid = $self->valid_session_key__( $session );
1330     return undef if ( !defined( $userid ) );
1331
1332     # We need to have two things before we can start, the id of the
1333     # word in the words table (if there's none then we need to add the
1334     # word), the bucket id in the buckets table (which must exist)
1335
1336     $word = $self->db_quote($word);
1337
1338     my $result = $self->{db__}->selectrow_arrayref(
1339                      "select words.id from words where words.word = $word limit 1;");
1340
1341     if ( !defined( $result ) ) {
1342         $self->{db__}->do( "insert into words ( word ) values ( $word );" );
1343         $result = $self->{db__}->selectrow_arrayref(
1344                      "select words.id from words where words.word = $word limit 1;");
1345     }
1346
1347     my $wordid = $result->[0];
1348     my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
1349
1350     $self->validate_sql_prepare_and_execute( $self->{db_put_word_count__}, $bucketid, $wordid, $count );
1351
1352     return 1;
1353 }
1354
1355 #----------------------------------------------------------------------------
1356 #
1357 # upgrade_predatabase_data__
1358 #
1359 # Looks for old POPFile data (in flat files or BerkeleyDB tables) and
1360 # upgrades it to the SQL database.  Data upgraded is removed.
1361 #
1362 #----------------------------------------------------------------------------
1363 sub upgrade_predatabase_data__
1364 {
1365     my ( $self ) = @_;
1366     my $c      = 0;
1367
1368     # There's an assumption here that this is the single user version
1369     # of POPFile and hence what we do is cheat and get a session key
1370     # assuming that the user name is admin with password ''
1371
1372     my $session = $self->get_session_key( 'admin', '' );
1373
1374     if ( !defined( $session ) ) {
1375         $self->log_( 0, "Tried to get the session key for user admin and failed; cannot upgrade old data" );
1376         return;
1377     }
1378
1379     my @buckets = glob $self->get_user_path_( $self->config_( 'corpus' ) . '/*' );
1380
1381     foreach my $bucket (@buckets) {
1382
1383         # A bucket directory must be a directory
1384
1385         next unless ( -d $bucket );
1386         next unless ( ( -e "$bucket/table" ) || ( -e "$bucket/table.db" ) );
1387
1388         return 0 if ( !$self->upgrade_bucket__( $session, $bucket ) );
1389
1390         my $color = '';
1391
1392         # See if there's a color file specified
1393         if ( open COLOR, '<' . "$bucket/color" ) {
1394             $color = <COLOR>;
1395
1396             # Someone (who shall remain nameless) went in and manually created
1397             # empty color files in their corpus directories which would cause
1398             # $color at this point to be undefined and hence you'd get warnings
1399             # about undefined variables below.  So this little test is to deal
1400             # with that user and to make POPFile a little safer which is always
1401             # a good thing
1402
1403             if ( !defined( $color ) ) {
1404                 $color = '';
1405             } else {
1406                 $color =~ s/[\r\n]//g;
1407             }
1408             close COLOR;
1409             unlink "$bucket/color";
1410         }
1411
1412         $bucket =~ /([[:alpha:]0-9-_]+)$/;
1413         $bucket =  $1;
1414
1415         $self->set_bucket_color( $session, $bucket, ($color eq '')?$self->{possible_colors__}[$c]:$color );
1416
1417         $c = ($c+1) % ($#{$self->{possible_colors__}}+1);
1418     }
1419
1420     $self->release_session_key( $session );
1421
1422     return 1;
1423 }
1424
1425 #----------------------------------------------------------------------------
1426 #
1427 # upgrade_bucket__
1428 #
1429 # Loads an individual bucket
1430 #
1431 # $session           Valid session key from get_session_key
1432 # $bucket            The bucket name
1433 #
1434 #----------------------------------------------------------------------------
1435 sub upgrade_bucket__
1436 {
1437     my ( $self, $session, $bucket ) = @_;
1438
1439     $bucket =~ /([[:alpha:]0-9-_]+)$/;
1440     $bucket =  $1;
1441
1442     $self->create_bucket( $session, $bucket );
1443
1444     if ( open PARAMS, '<' . $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/params" ) ) {
1445         while ( <PARAMS> )  {
1446             s/[\r\n]//g;
1447             if ( /^([[:lower:]]+) ([^\r\n\t ]+)$/ )  {
1448                 $self->set_bucket_parameter( $session, $bucket, $1, $2 );
1449             }
1450         }
1451         close PARAMS;
1452         unlink $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/params" );
1453     }
1454
1455     # Pre v0.21.0 POPFile had GLOBAL parameters for subject modification,
1456     # XTC and XPL insertion.  To make the upgrade as clean as possible
1457     # check these parameters so that if they were OFF we set the equivalent
1458     # per bucket to off
1459
1460     foreach my $gl ( 'subject', 'xtc', 'xpl' ) {
1461         $self->log_( 1, "Checking deprecated parameter GLOBAL_$gl for $bucket\n" );
1462         my $val = $self->{configuration__}->deprecated_parameter( "GLOBAL_$gl" );
1463         if ( defined( $val ) && ( $val == 0 ) ) {
1464             $self->log_( 1, "GLOBAL_$gl is 0 for $bucket, overriding $gl\n" );
1465             $self->set_bucket_parameter( $session, $bucket, $gl, 0 );
1466         }
1467     }
1468
1469     # See if there are magnets defined
1470     if ( open MAGNETS, '<' . $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/magnets" ) ) {
1471         while ( <MAGNETS> )  {
1472             s/[\r\n]//g;
1473
1474             # Because of a bug in v0.17.9 and earlier of POPFile the text of
1475             # some magnets was getting mangled by certain characters having
1476             # a \ prepended.  Code here removes the \ in these cases to make
1477             # an upgrade smooth.
1478
1479             if ( /^([^ ]+) (.+)$/ )  {
1480                 my $type  = $1;
1481                 my $value = $2;
1482
1483                 # Some people were accidently creating magnets with
1484                 # trailing whitespace which really confused them later
1485                 # when their magnet did not match (see comment in
1486                 # UI::HTML::magnet for more detail)
1487
1488                 $value =~ s/^[ \t]+//g;
1489                 $value =~ s/[ \t]+$//g;
1490
1491                 $value =~ s/\\(\?|\*|\||\(|\)|\[|\]|\{|\}|\^|\$|\.)/$1/g;
1492                 $self->create_magnet( $session, $bucket, $type, $value );
1493             } else {
1494
1495                 # This branch is used to catch the original magnets in an
1496                 # old version of POPFile that were just there for from
1497                 # addresses only
1498
1499                 if ( /^(.+)$/ ) {
1500                     my $value = $1;
1501                     $value =~ s/\\(\?|\*|\||\(|\)|\[|\]|\{|\}|\^|\$|\.)/$1/g;
1502                     $self->create_magnet( $session, $bucket, 'from', $value );
1503                 }
1504             }
1505         }
1506         close MAGNETS;
1507         unlink $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/magnets" );
1508     }
1509
1510     # If there is no existing table but there is a table file (the old style
1511     # flat file used by POPFile for corpus storage) then create the new
1512     # database from it thus performing an automatic upgrade.
1513
1514     if ( -e $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table" ) ) {
1515         $self->log_( 0, "Performing automatic upgrade of $bucket corpus from flat file to DBI" );
1516
1517         $self->{db__}->begin_work;
1518
1519         if ( open WORDS, '<' . $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table" ) )  {
1520
1521             my $wc = 1;
1522
1523             my $first = <WORDS>;
1524             if ( defined( $first ) && ( $first =~ s/^__CORPUS__ __VERSION__ (\d+)// ) ) {
1525                 if ( $1 != $self->{corpus_version__} )  {
1526                     print STDERR "Incompatible corpus version in $bucket\n";
1527                     close WORDS;
1528                     $self->{db__}->rollback;
1529                     return 0;
1530                 } else {
1531                     $self->log_( 0, "Upgrading bucket $bucket..." );
1532
1533                     while ( <WORDS> ) {
1534                         if ( $wc % 100 == 0 ) {
1535                             $self->log_( 0, "$wc" );
1536                         }
1537                         $wc += 1;
1538                         s/[\r\n]//g;
1539
1540                         if ( /^([^\s]+) (\d+)$/ ) {
1541                             if ( $2 != 0 ) {
1542                                 $self->db_put_word_count__( $session, $bucket, $1, $2 );
1543                             }
1544                         } else {
1545                             $self->log_( 0, "Found entry in corpus for $bucket that looks wrong: \"$_\" (ignoring)" );
1546                         }
1547                     }
1548                 }
1549
1550                 if ( $wc > 1 ) {
1551                     $wc -= 1;
1552                     $self->log_( 0, "(completed $wc words)" );
1553                 }
1554                 close WORDS;
1555             } else {
1556                 close WORDS;
1557                 $self->{db__}->rollback;
1558                 unlink $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table" );
1559                 return 0;
1560             }
1561
1562             $self->{db__}->commit;
1563             unlink $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table" );
1564         }
1565     }
1566
1567     # Now check to see if there's a BerkeleyDB-style table
1568
1569     my $bdb_file = $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table.db" );
1570
1571     if ( -e $bdb_file ) {
1572         $self->log_( 0, "Performing automatic upgrade of $bucket corpus from BerkeleyDB to DBI" );
1573
1574         require BerkeleyDB;
1575
1576         my %h;
1577         tie %h, "BerkeleyDB::Hash", -Filename => $bdb_file;
1578
1579         $self->log_( 0, "Upgrading bucket $bucket..." );
1580         $self->{db__}->begin_work;
1581
1582         my $wc = 1;
1583
1584         for my $word (keys %h) {
1585             if ( $wc % 100 == 0 ) {
1586                 $self->log_( 0, "$wc" );
1587             }
1588
1589             next if ( $word =~ /__POPFILE__(LOG__TOTAL|TOTAL|UNIQUE)__/ );
1590
1591             $wc += 1;
1592             if ( $h{$word} != 0 ) {
1593                 $self->db_put_word_count__( $session, $bucket, $word, $h{$word} );
1594             }
1595         }
1596
1597         $wc -= 1;
1598         $self->log_( 0, "(completed $wc words)" );
1599         $self->{db__}->commit;
1600         untie %h;
1601         unlink $bdb_file;
1602     }
1603
1604     return 1;
1605 }
1606
1607 #----------------------------------------------------------------------------
1608 #
1609 # magnet_match_helper__
1610 #
1611 # Helper the determines if a specific string matches a certain magnet
1612 # type in a bucket, used by magnet_match_
1613 #
1614 # $session         Valid session from get_session_key
1615 # $match           The string to match
1616 # $bucket          The bucket to check
1617 # $type            The magnet type to check
1618 #
1619 #----------------------------------------------------------------------------
1620 sub magnet_match_helper__
1621 {
1622     my ( $self, $session, $match, $bucket, $type ) = @_;
1623
1624     my $userid = $self->valid_session_key__( $session );
1625     return undef if ( !defined( $userid ) );
1626
1627     $match = lc($match);
1628
1629     my @magnets;
1630
1631     my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
1632     my $h = $self->validate_sql_prepare_and_execute(                                           # PROFILE BLOCK START
1633         "select magnets.val, magnets.id from magnets, users, buckets, magnet_types
1634              where buckets.id = $bucketid and
1635                    magnets.id != 0 and
1636                    users.id = buckets.userid and
1637                    magnets.bucketid = buckets.id and
1638                    magnet_types.mtype = '$type' and
1639                    magnets.mtid = magnet_types.id order by magnets.val;" );   # PROFILE BLOCK STOP
1640     while ( my $row = $h->fetchrow_arrayref ) {
1641         push @magnets, [$row->[0], $row->[1]];
1642     }
1643     $h->finish;
1644
1645     foreach my $m (@magnets) {
1646         my ( $magnet, $id ) = @{$m};
1647         $magnet = lc($magnet);
1648
1649         for my $i (0..(length($match)-length($magnet))) {
1650             if ( substr( $match, $i, length($magnet)) eq $magnet ) {
1651                 $self->{magnet_used__}   = 1;
1652                 $self->{magnet_detail__} = $id;
1653
1654                 return 1;
1655             }
1656         }
1657     }
1658
1659     return 0;
1660 }
1661
1662 #----------------------------------------------------------------------------
1663 #
1664 # magnet_match__
1665 #
1666 # Helper the determines if a specific string matches a certain magnet
1667 # type in a bucket
1668 #
1669 # $session         Valid session from get_session_key
1670 # $match           The string to match
1671 # $bucket          The bucket to check
1672 # $type            The magnet type to check
1673 #
1674 #----------------------------------------------------------------------------
1675 sub magnet_match__
1676 {
1677     my ( $self, $session, $match, $bucket, $type ) = @_;
1678
1679     return $self->magnet_match_helper__( $session, $match, $bucket, $type );
1680 }
1681
1682 #----------------------------------------------------------------------------
1683 #
1684 # write_line__
1685 #
1686 # Writes a line to a file and parses it unless the classification is
1687 # already known
1688 #
1689 # $file         File handle for file to write line to
1690 # $line         The line to write
1691 # $class        (optional) The current classification
1692 #
1693 #----------------------------------------------------------------------------
1694 sub write_line__
1695 {
1696     my ( $self, $file, $line, $class ) = @_;
1697
1698     print $file $line if defined( $file );
1699
1700     if ( $class eq '' ) {
1701         $self->{parser__}->parse_line( $line );
1702     }
1703 }
1704
1705 #----------------------------------------------------------------------------
1706 #
1707 # add_words_to_bucket__
1708 #
1709 # Takes words previously parsed by the mail parser and adds/subtracts
1710 # them to/from a bucket, this is a helper used by
1711 # add_messages_to_bucket, remove_message_from_bucket
1712 #
1713 # $session        Valid session from get_session_key
1714 # $bucket         Bucket to add to
1715 # $subtract       Set to -1 means subtract the words, set to 1 means add
1716 #
1717 #----------------------------------------------------------------------------
1718 sub add_words_to_bucket__
1719 {
1720     my ( $self, $session, $bucket, $subtract ) = @_;
1721
1722     my $userid = $self->valid_session_key__( $session );
1723     return undef if ( !defined( $userid ) );
1724
1725     # Map the list of words to a list of counts currently in the database
1726     # then update those counts and write them back to the database.
1727
1728     my $words;
1729     $words = join( ',', map( $self->{db__}->quote( $_ ), (sort keys %{$self->{parser__}{words__}}) ) );
1730     $self->{get_wordids__} = $self->validate_sql_prepare_and_execute(        # PROFILE BLOCK START
1731              "select id, word
1732                   from words
1733                   where word in ( $words );" );             # PROFILE BLOCK STOP
1734
1735     my @id_list;
1736     my %wordmap;
1737
1738     while ( my $row = $self->{get_wordids__}->fetchrow_arrayref ) {
1739         push @id_list, ($row->[0]);
1740         $wordmap{$row->[1]} = $row->[0];
1741     }
1742
1743     $self->{get_wordids__}->finish;
1744     undef $self->{get_wordids__};
1745
1746     my $ids = join( ',', @id_list );
1747
1748     $self->{db_getwords__} = $self->validate_sql_prepare_and_execute(                                         # PROFILE BLOCK START
1749              "select matrix.times, matrix.wordid
1750                   from matrix
1751                   where matrix.wordid in ( $ids )
1752                     and matrix.bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};" );  # PROFILE BLOCK STOP
1753
1754     my %counts;
1755
1756     while ( my $row = $self->{db_getwords__}->fetchrow_arrayref ) {
1757         $counts{$row->[1]} = $row->[0];
1758     }
1759
1760     $self->{db_getwords__}->finish;
1761     undef $self->{db_getwords__};
1762
1763     $self->{db__}->begin_work;
1764     foreach my $word (keys %{$self->{parser__}->{words__}}) {
1765
1766         # If there's already a count then it means that the word is
1767         # already in the database and we have its id in
1768         # $wordmap{$word} so for speed we execute the
1769         # db_put_word_count__ query here rather than going through
1770         # set_value_ which would need to look up the wordid again
1771
1772         if ( defined( $wordmap{$word} ) && defined( $counts{$wordmap{$word}} ) ) {
1773             $self->validate_sql_prepare_and_execute( $self->{db_put_word_count__}, $self->{db_bucketid__}{$userid}{$bucket}{id},               # PROFILE BLOCK START
1774                 $wordmap{$word}, $counts{$wordmap{$word}} + $subtract * $self->{parser__}->{words__}{$word} ); # PROFILE BLOCK STOP
1775         } else {
1776
1777             # If the word is not in the database and we are trying to
1778             # subtract then we do nothing because negative values are
1779             # meaningless
1780
1781             if ( $subtract == 1 ) {
1782                 $self->db_put_word_count__( $session, $bucket, $word, $self->{parser__}->{words__}{$word} );
1783             }
1784         }
1785     }
1786
1787     # If we were doing a subtract operation it's possible that some of
1788     # the words in the bucket now have a zero count and should be
1789     # removed
1790
1791     if ( $subtract == -1 ) {
1792         $self->validate_sql_prepare_and_execute( $self->{db_delete_zero_words__}, $self->{db_bucketid__}{$userid}{$bucket}{id} );
1793     }
1794
1795     $self->{db__}->commit;
1796 }
1797
1798 #----------------------------------------------------------------------------
1799 #
1800 # echo_to_dot_
1801 #
1802 # $mail The stream (created with IO::) to send the message to (the
1803 # remote mail server)
1804 # $client (optional) The local mail client (created with IO::) that
1805 # needs the response
1806 # $file (optional) A file to print the response to, caller specifies
1807 # open style
1808 # $before (optional) String to send to client before the dot is sent
1809 #
1810 # echo all information from the $mail server until a single line with
1811 # a . is seen
1812 #
1813 # NOTE Also echoes the line with . to $client but not to $file
1814 #
1815 # Returns 1 if there was a . or 0 if reached EOF before we hit the .
1816 #
1817 #----------------------------------------------------------------------------
1818 sub echo_to_dot_
1819 {
1820     my ( $self, $mail, $client, $file, $before ) = @_;
1821
1822     my $hit_dot = 0;
1823
1824     my $isopen = open FILE, "$file" if ( defined( $file ) );
1825     binmode FILE if ($isopen);
1826
1827     while ( my $line = $self->slurp_( $mail ) ) {
1828
1829         # Check for an abort
1830
1831         last if ( $self->{alive_} == 0 );
1832
1833         # The termination has to be a single line with exactly a dot
1834         # on it and nothing else other than line termination
1835         # characters.  This is vital so that we do not mistake a line
1836         # beginning with . as the end of the block
1837
1838         if ( $line =~ /^\.(\r\n|\r|\n)$/ ) {
1839             $hit_dot = 1;
1840
1841             if ( defined( $before ) && ( $before ne '' ) ) {
1842                 print $client $before if ( defined( $client ) );
1843                 print FILE    $before if ( defined( $isopen ) );
1844             }
1845
1846             # Note that there is no print FILE here.  This is correct
1847             # because we do no want the network terminator . to appear
1848             # in the file version of any message
1849
1850             print $client $line if ( defined( $client ) );
1851             last;
1852         }
1853
1854         print $client $line if ( defined( $client ) );
1855         print FILE    $line if ( defined( $isopen ) );
1856
1857     }
1858
1859     close FILE if ( $isopen );
1860
1861     return $hit_dot;
1862 }
1863
1864 #----------------------------------------------------------------------------
1865 #
1866 # substr_euc__
1867 #
1868 # "substr" function which supports EUC Japanese charset
1869 #
1870 # $pos      Start position
1871 # $len      Word length
1872 #
1873 #----------------------------------------------------------------------------
1874 sub substr_euc__
1875 {
1876     my ( $str, $pos, $len ) = @_;
1877     my $result_str;
1878     my $char;
1879     my $count = 0;
1880     if ( !$pos ) {
1881         $pos = 0;
1882     }
1883     if ( !$len ) {
1884         $len = length( $str );
1885     }
1886
1887     for ( $pos = 0; $count < $len; $pos++ ) {
1888         $char = substr( $str, $pos, 1 );
1889         if ( $char =~ /[\x80-\xff]/ ) {
1890             $char = substr( $str, $pos++, 2 );
1891         }
1892         $result_str .= $char;
1893         $count++;
1894     }
1895
1896     return $result_str;
1897 }
1898
1899 #----------------------------------------------------------------------------
1900 #
1901 # generate_unique_session_key__
1902 #
1903 # Returns a unique string based session key that can be used as a key
1904 # in the api_sessions__
1905 #
1906 #----------------------------------------------------------------------------
1907 sub generate_unique_session_key__
1908 {
1909     my ( $self ) = @_;
1910
1911     my @chars = ( 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',   # PROFILE BLOCK START
1912                   'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'U', 'V', 'W', 'X', 'Y',
1913                   'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A' ); # PROFILE BLOCK STOP
1914
1915     my $session;
1916
1917     do {
1918         $session = '';
1919         my $length = int( 16 + rand(4) );
1920
1921         for my $i (0 .. $length) {
1922             my $random = $chars[int( rand(36) )];
1923
1924             # Just to add spice to things we sometimes lowercase the value
1925
1926             if ( rand(1) < rand(1) ) {
1927                 $random = lc($random);
1928             }
1929
1930             $session .= $random;
1931         }
1932     } while ( defined( $self->{api_sessions__}{$session} ) );
1933
1934     return $session;
1935 }
1936
1937 #----------------------------------------------------------------------------
1938 #
1939 # release_session_key_private__
1940 #
1941 # $session        A session key previously returned by get_session_key
1942 #
1943 # Releases and invalidates the session key. Worker function that does the work
1944 # of release_session_key.
1945 #                   ****DO NOT CALL DIRECTLY****
1946 # unless you want your session key released immediately, possibly preventing
1947 # asynchronous tasks from completing
1948 #
1949 #----------------------------------------------------------------------------
1950 sub release_session_key_private__
1951 {
1952     my ( $self, $session ) = @_;
1953
1954     if ( defined( $self->{api_sessions__}{$session} ) ) {
1955         $self->log_( 1, "release_session_key releasing key $session for user $self->{api_sessions__}{$session}" );
1956         delete $self->{api_sessions__}{$session};
1957     }
1958 }
1959
1960 #----------------------------------------------------------------------------
1961 #
1962 # valid_session_key__
1963 #
1964 # $session                Session key returned by call to get_session_key
1965 #
1966 # Returns undef is the session key is not valid, or returns the user
1967 # ID associated with the session key which can be used in database
1968 # accesses
1969 #
1970 #----------------------------------------------------------------------------
1971 sub valid_session_key__
1972 {
1973     my ( $self, $session ) = @_;
1974
1975     # This provides protection against someone using the XML-RPC
1976     # interface and calling this API directly to fish for session
1977     # keys, this must be called from within this module
1978
1979     return undef if ( caller ne 'Classifier::Bayes' );
1980
1981     # If the session key is invalid then wait 1 second.  This is done
1982     # to prevent people from calling a POPFile API such as
1983     # get_bucket_count with random session keys fishing for a valid
1984     # key.  The XML-RPC API is single threaded and hence this will
1985     # delay all use of that API by one second.  Of course in normal
1986     # use when the user knows the username/password or session key
1987     # then there is no delay
1988
1989     if ( !defined( $self->{api_sessions__}{$session} ) ) {
1990         my ( $package, $filename, $line, $subroutine ) = caller;
1991         $self->log_( 0, "Invalid session key $session provided in $package @ $line" );
1992         select( undef, undef, undef, 1 );
1993     }
1994
1995     return $self->{api_sessions__}{$session};
1996 }
1997
1998 #----------------------------------------------------------------------------
1999 #----------------------------------------------------------------------------
2000 # _____   _____   _____  _______ _____        _______   _______  _____  _____
2001 #|_____] |     | |_____] |______   |   |      |______   |_____| |_____]   |
2002 #|       |_____| |       |       __|__ |_____ |______   |     | |       __|__
2003 #
2004 # The method below are public and may be accessed by other modules.
2005 # All of them may be accessed remotely through the XMLRPC.pm module
2006 # using the XML-RPC protocol
2007 #
2008 # Note that every API function expects to be passed a $session which
2009 # is obtained by first calling get_session_key with a valid username
2010 # and password.  Once done call the method release_session_key.
2011 #
2012 # See POPFile::API for more details
2013 #
2014 #----------------------------------------------------------------------------
2015 #----------------------------------------------------------------------------
2016
2017 #----------------------------------------------------------------------------
2018 #
2019 # get_session_key
2020 #
2021 # $user           The name of an existing user
2022 # $pwd            The user's password
2023 #
2024 # Returns a string based session key if the username and password
2025 # match, or undef if not
2026 #
2027 #----------------------------------------------------------------------------
2028 sub get_session_key
2029 {
2030     my ( $self, $user, $pwd ) = @_;
2031
2032     # The password is stored in the database as an MD5 hash of the
2033     # username and password concatenated and separated by the string
2034     # __popfile__, so compute the hash here
2035
2036     my $hash = md5_hex( $user . '__popfile__' . $pwd );
2037
2038     $self->validate_sql_prepare_and_execute( $self->{db_get_userid__}, $user, $hash );
2039     my $result = $self->{db_get_userid__}->fetchrow_arrayref;
2040     if ( !defined( $result ) ) {
2041
2042         # The delay of one second here is to prevent people from trying out
2043         # username/password combinations at high speed to determine the
2044         # credentials of a valid user
2045
2046         $self->log_( 0, "Attempt to login with incorrect credentials for user $user" );
2047         select( undef, undef, undef, 1 );
2048         return undef;
2049     }
2050
2051     my $session = $self->generate_unique_session_key__();
2052
2053     $self->{api_sessions__}{$session} = $result->[0];
2054
2055     $self->db_update_cache__( $session );
2056
2057     $self->log_( 1, "get_session_key returning key $session for user $self->{api_sessions__}{$session}" );
2058
2059     return $session;
2060 }
2061
2062 #----------------------------------------------------------------------------
2063 #
2064 # release_session_key
2065 #
2066 # $session        A session key previously returned by get_session_key
2067 #
2068 # Releases and invalidates the session key
2069 #
2070 #----------------------------------------------------------------------------
2071 sub release_session_key
2072 {
2073     my ( $self, $session ) = @_;
2074
2075     $self->mq_post_( "RELSE", $session );
2076 }
2077
2078
2079 #----------------------------------------------------------------------------
2080 #
2081 # get_top_bucket__
2082 #
2083 # Helper function used by classify to get the bucket with the highest
2084 # score from data stored in a matrix of information (see definition of
2085 # %matrix in classify for details) and a list of potential buckets
2086 #
2087 # $userid         User ID for database access
2088 # $id             ID of a word in $matrix
2089 # $matrix         Reference to the %matrix hash in classify
2090 # $buckets        Reference to a list of buckets
2091 #
2092 # Returns the bucket in $buckets with the highest score
2093 #
2094 #----------------------------------------------------------------------------
2095 sub get_top_bucket__
2096 {
2097     my ( $self, $userid, $id, $matrix, $buckets ) = @_;
2098
2099     my $best_probability = 0;
2100     my $top_bucket       = 'unclassified';
2101
2102     for my $bucket (@$buckets) {
2103         my $probability = 0;
2104         if ( defined($$matrix{$id}{$bucket}) && ( $$matrix{$id}{$bucket} > 0 ) ) {
2105             $probability = $$matrix{$id}{$bucket} / $self->{db_bucketcount__}{$userid}{$bucket};
2106         }
2107
2108         if ( $probability > $best_probability ) {
2109             $best_probability = $probability;
2110             $top_bucket       = $bucket;
2111         }
2112     }
2113
2114     return $top_bucket;
2115 }
2116
2117 #----------------------------------------------------------------------------
2118 #
2119 # classify
2120 #
2121 # $session   A valid session key returned by a call to get_session_key
2122 # $file The name of the file containing the text to classify (or undef
2123 # to use the data already in the parser)
2124 # $templ     Reference to the UI template used for word score display
2125 # $matrix (optional) Reference to a hash that will be filled with the
2126 # word matrix used in classification
2127 # $idmap (optional) Reference to a hash that will map word ids in the
2128 # $matrix to actual words
2129 #
2130 # Splits the mail message into valid words, then runs the Bayes
2131 # algorithm to figure out which bucket it belongs in.  Returns the
2132 # bucket name
2133 #
2134 #----------------------------------------------------------------------------
2135 sub classify
2136 {
2137     my ( $self, $session, $file, $templ, $matrix, $idmap ) = @_;
2138     my $msg_total = 0;
2139
2140     my $userid = $self->valid_session_key__( $session );
2141     return undef if ( !defined( $userid ) );
2142
2143     $self->{unclassified__} = log( $self->config_( 'unclassified_weight' ) );
2144
2145     $self->{magnet_used__}   = 0;
2146     $self->{magnet_detail__} = 0;
2147
2148     if ( defined( $file ) ) {
2149         $self->{parser__}->parse_file( $file,                                           # PROFILE BLOCK START
2150                                        $self->global_config_( 'message_cutoff'   ) );   # PROFILE BLOCK STOP
2151     }
2152
2153     # Get the list of buckets
2154
2155     my @buckets = $self->get_buckets( $session );
2156
2157     # If the user has not defined any buckets then we escape here
2158     # return unclassified
2159
2160     return "unclassified" if ( $#buckets == -1 );
2161
2162     # Check to see if this email should be classified based on a magnet
2163
2164     for my $bucket ($self->get_buckets_with_magnets( $session ))  {
2165         for my $type ($self->get_magnet_types_in_bucket( $session, $bucket )) {
2166             if ( $self->magnet_match__( $session, $self->{parser__}->get_header($type), $bucket, $type ) ) {
2167                 return $bucket;
2168             }
2169         }
2170     }
2171
2172     # The score hash will contain the likelihood that the given
2173     # message is in each bucket, the buckets are the keys for score
2174
2175     # Set up the initial score as P(bucket)
2176
2177     my %score;
2178     my %matchcount;
2179
2180     # Build up a list of the buckets that are OK to use for
2181     # classification (i.e.  that have at least one word in them).
2182
2183     my @ok_buckets;
2184
2185     for my $bucket (@buckets) {
2186         if ( $self->{bucket_start__}{$userid}{$bucket} != 0 ) {
2187             $score{$bucket} = $self->{bucket_start__}{$userid}{$bucket};
2188             $matchcount{$bucket} = 0;
2189             push @ok_buckets, ( $bucket );
2190         }
2191     }
2192
2193     @buckets = @ok_buckets;
2194
2195     # If the user does not have at least two buckets which contains
2196     # some words then we escape here return unclassified
2197
2198     return "unclassified" if ( $#buckets < 1 );
2199
2200     # For each word go through the buckets and calculate
2201     # P(word|bucket) and then calculate P(word|bucket) ^ word count
2202     # and multiply to the score
2203
2204     my $word_count = 0;
2205
2206     # The correction value is used to generate score displays variable
2207     # which are consistent with the word scores shown by the GUI's
2208     # word lookup feature.  It is computed to make the contribution of
2209     # a word which is unrepresented in a bucket zero.  This correction
2210     # affects only the values displayed in the display; it has no
2211     # effect on the classification process.
2212
2213     my $correction = 0;
2214
2215     # Classification against the database works in a sequence of steps
2216     # to get the fastest time possible.  The steps are as follows:
2217     #
2218     # 1. Convert the list of words returned by the parser into a list
2219     #    of unique word ids that can be used in the database.  This
2220     #    requires a select against the database to get the word ids
2221     #    (and associated words) which is then converted into two
2222     #    things: @id_list which is just the sorted list of word ids
2223     #    and %idmap which maps a word to its id.
2224     #
2225     # 2. Then run a second select that get the triplet (count, id,
2226     #    bucket) for each word id and each bucket.  The triplet
2227     #    contains the word count from the database for each bucket and
2228     #    each id, where there is an entry. That data gets loaded into
2229     #    the sparse matrix %matrix.
2230     #
2231     # 3. Do the normal classification loop as before running against
2232     # the @id_list for the words and for each bucket.  If there's an
2233     # entry in %matrix for the id/bucket combination then calculate
2234     # the probability, otherwise use the not_likely probability.
2235     #
2236     # NOTE.  Since there is a single not_likely probability we do not
2237     # worry about the fact that the select in 1 might return a shorter
2238     # list of words than was found in the message (because some words
2239     # are not in the database) since the missing words will be the
2240     # same for all buckets and hence constitute a fixed scaling factor
2241     # on all the buckets which is irrelevant in deciding which the
2242     # winning bucket is.
2243
2244     my $words;
2245     $words = join( ',', map( $self->{db__}->quote( $_ ), (sort keys %{$self->{parser__}{words__}}) ) );
2246     $self->{get_wordids__} = $self->validate_sql_prepare_and_execute(  # PROFILE BLOCK START
2247              "select id, word
2248                   from words
2249                   where word in ( $words )
2250                   order by id;" );                    # PROFILE BLOCK STOP
2251
2252     my @id_list;
2253     my %temp_idmap;
2254
2255     if ( !defined( $idmap ) ) {
2256         $idmap = \%temp_idmap;
2257     }
2258
2259     while ( my $row = $self->{get_wordids__}->fetchrow_arrayref ) {
2260         push @id_list, ($row->[0]);
2261         $$idmap{$row->[0]} = $row->[1];
2262     }
2263
2264     $self->{get_wordids__}->finish;
2265     undef $self->{get_wordids__};
2266
2267     my $ids = join( ',', @id_list );
2268
2269     $self->{db_classify__} = $self->validate_sql_prepare_and_execute(            # PROFILE BLOCK START
2270              "select matrix.times, matrix.wordid, buckets.name
2271                   from matrix, buckets
2272                   where matrix.wordid in ( $ids )
2273                     and matrix.bucketid = buckets.id
2274                     and buckets.userid = $userid;" );           # PROFILE BLOCK STOP
2275
2276     # %matrix maps wordids and bucket names to counts
2277     # $matrix{$wordid}{$bucket} == $count
2278
2279     my %temp_matrix;
2280
2281     if ( !defined( $matrix ) ) {
2282         $matrix = \%temp_matrix;
2283     }
2284
2285     while ( my $row = $self->{db_classify__}->fetchrow_arrayref ) {
2286         $$matrix{$row->[1]}{$row->[2]} = $row->[0];
2287     }
2288
2289     $self->{db_classify__}->finish;
2290     undef $self->{db_classify__};
2291
2292     foreach my $id (@id_list) {
2293         $word_count += 2;
2294         my $wmax = -10000;
2295
2296         foreach my $bucket (@buckets) {
2297             my $probability = 0;
2298
2299             if ( defined($$matrix{$id}{$bucket}) && ( $$matrix{$id}{$bucket} > 0 ) ) {
2300                 $probability = log( $$matrix{$id}{$bucket} / $self->{db_bucketcount__}{$userid}{$bucket} );
2301             }
2302
2303             $matchcount{$bucket} += $self->{parser__}{words__}{$$idmap{$id}} if ($probability != 0);
2304             $probability = $self->{not_likely__}{$userid} if ( $probability == 0 );
2305             $wmax = $probability if ( $wmax < $probability );
2306             $score{$bucket} += ( $probability * $self->{parser__}{words__}{$$idmap{$id}} );
2307         }
2308
2309         if ($wmax > $self->{not_likely__}{$userid}) {
2310             $correction += $self->{not_likely__}{$userid} * $self->{parser__}{words__}{$$idmap{$id}};
2311         } else {
2312             $correction += $wmax * $self->{parser__}{words__}{$$idmap{$id}};
2313         }
2314     }
2315
2316     # Now sort the scores to find the highest and return that bucket
2317     # as the classification
2318
2319     my @ranking = sort {$score{$b} <=> $score{$a}} keys %score;
2320
2321     my %raw_score;
2322     my $base_score = $score{$ranking[0]};
2323     my $total = 0;
2324
2325     # If the first and second bucket are too close in their
2326     # probabilities, call the message unclassified.  Also if there are
2327     # fewer than 2 buckets.
2328
2329     my $class = 'unclassified';
2330
2331     if ( @buckets > 1 && $score{$ranking[0]} > ( $score{$ranking[1]} + $self->{unclassified__} ) ) {
2332         $class = $ranking[0];
2333     }
2334
2335     # Compute the total of all the scores to generate the normalized
2336     # scores and probability estimate.  $total is always 1 after the
2337     # first loop iteration, so any additional term less than 2 ** -54
2338     # is insignificant, and need not be computed.
2339
2340     my $ln2p_54 = -54 * log(2);
2341
2342     foreach my $b (@ranking) {
2343         $raw_score{$b} = $score{$b};
2344         $score{$b} -= $base_score;
2345
2346         $total += exp($score{$b}) if ($score{$b} > $ln2p_54 );
2347     }
2348
2349     if ($self->{wordscores__} && defined($templ) ) {
2350         my %qm = %{$self->{parser__}->quickmagnets()};
2351         my $mlen = scalar(keys %{$self->{parser__}->quickmagnets()});
2352
2353         if ( $mlen > 0 ) {
2354             $templ->param( 'View_QuickMagnets_If' => 1 );
2355             $templ->param( 'View_QuickMagnets_Count' => ($mlen + 1) );
2356             my @buckets = $self->get_buckets( $session );
2357             my $i = 0;
2358             my %types = $self->get_magnet_types( $session );
2359
2360             my @bucket_data;
2361             foreach my $bucket (@buckets) {
2362                 my %row_data;
2363                 $row_data{View_QuickMagnets_Bucket} = $bucket;
2364                 $row_data{View_QuickMagnets_Bucket_Color} = $self->get_bucket_color( $session, $bucket );
2365                 push ( @bucket_data, \%row_data );
2366             }
2367
2368             my @qm_data;
2369             foreach my $type (sort keys %types) {
2370                 my %row_data;
2371
2372                 if (defined $qm{$type}) {
2373                     $i++;
2374
2375                     $row_data{View_QuickMagnets_Type} = $type;
2376                     $row_data{View_QuickMagnets_I} = $i;
2377                     $row_data{View_QuickMagnets_Loop_Buckets} = \@bucket_data;
2378
2379                     my @magnet_data;
2380                     foreach my $magnet ( @{$qm{$type}} ) {
2381                         my %row_magnet;
2382                         $row_magnet{View_QuickMagnets_Magnet} = $magnet;
2383                         push ( @magnet_data, \%row_magnet );
2384                     }
2385                     $row_data{View_QuickMagnets_Loop_Magnets} = \@magnet_data;
2386
2387                     push ( @qm_data, \%row_data );
2388                 }
2389             }
2390             $templ->param( 'View_QuickMagnets_Loop' => \@qm_data );
2391         }
2392
2393         $templ->param( 'View_Score_If_Score' => $self->{wmformat__} eq 'score' );
2394         my $log10 = log(10.0);
2395
2396         my @score_data;
2397         foreach my $b (@ranking) {
2398              my %row_data;
2399              my $prob = exp($score{$b})/$total;
2400              my $probstr;
2401              my $rawstr;
2402
2403              # If the computed probability would display as 1, display
2404              # it as .999999 instead.  We don't want to give the
2405              # impression that POPFile is ever completely sure of its
2406              # classification.
2407
2408              if ($prob >= .999999) {
2409                  $probstr = sprintf("%12.6f", 0.999999);
2410              } else {
2411                  if ($prob >= 0.1 || $prob == 0.0) {
2412                      $probstr = sprintf("%12.6f", $prob);
2413                  } else {
2414                     $probstr = sprintf("%17.6e", $prob);
2415                  }
2416              }
2417
2418              my $color = $self->get_bucket_color( $session, $b );
2419
2420              $row_data{View_Score_Bucket} = $b;
2421              $row_data{View_Score_Bucket_Color} = $color;
2422              $row_data{View_Score_MatchCount} = $matchcount{$b};
2423              $row_data{View_Score_ProbStr} = $probstr;
2424
2425              if ($self->{wmformat__} eq 'score') {
2426                  $row_data{View_Score_If_Score} = 1;
2427                  $rawstr = sprintf("%12.6f", ($raw_score{$b} - $correction)/$log10);
2428                  $row_data{View_Score_RawStr} = $rawstr;
2429              }
2430              push ( @score_data, \%row_data );
2431         }
2432         $templ->param( 'View_Score_Loop_Scores' => \@score_data );
2433
2434         if ( $self->{wmformat__} ne '' ) {
2435             $templ->param( 'View_Score_If_Table' => 1 );
2436
2437             my @header_data;
2438             foreach my $ix (0..($#buckets > 7? 7: $#buckets)) {
2439                 my %row_data;
2440                 my $bucket = $ranking[$ix];
2441                 my $bucketcolor  = $self->get_bucket_color( $session, $bucket );
2442                 $row_data{View_Score_Bucket} = $bucket;
2443                 $row_data{View_Score_Bucket_Color} = $bucketcolor;
2444                 push ( @header_data, \%row_data );
2445             }
2446             $templ->param( 'View_Score_Loop_Bucket_Header' => \@header_data );
2447
2448             my %wordprobs;
2449
2450             # If the word matrix is supposed to show probabilities,
2451             # compute them, saving the results in %wordprobs.
2452
2453             if ( $self->{wmformat__} eq 'prob') {
2454                 foreach my $id (@id_list) {
2455                     my $sumfreq = 0;
2456                     my %wval;
2457                     foreach my $bucket (@ranking) {
2458                         $wval{$bucket} = $$matrix{$id}{$bucket} || 0;
2459                         $sumfreq += $wval{$bucket};
2460                     }
2461
2462                     # If $sumfreq is still zero then this word didn't
2463                     # appear in any buckets so we shouldn't create
2464                     # wordprobs entries for it
2465
2466                     if ( $sumfreq != 0 ) {
2467                         foreach my $bucket (@ranking) {
2468                             $wordprobs{$bucket,$id} = $wval{$bucket} / $sumfreq;
2469                         }
2470                     }
2471                 }
2472             }
2473
2474             my @ranked_ids;
2475             if ($self->{wmformat__} eq 'prob') {
2476                 @ranked_ids = sort {($wordprobs{$ranking[0],$b}||0) <=> ($wordprobs{$ranking[0],$a}||0)} @id_list;
2477             } else {
2478                 @ranked_ids = sort {($$matrix{$b}{$ranking[0]}||0) <=> ($$matrix{$a}{$ranking[0]}||0)} @id_list;
2479             }
2480
2481             my @word_data;
2482             my %chart;
2483             foreach my $id (@ranked_ids) {
2484                 my %row_data;
2485                 my $known = 0;
2486
2487                 foreach my $bucket (@ranking) {
2488                     if ( defined( $$matrix{$id}{$bucket} ) ) {
2489                         $known = 1;
2490                         last;
2491                     }
2492                 }
2493
2494                 if ( $known == 1 ) {
2495                     my $wordcolor = $self->get_bucket_color( $session, $self->get_top_bucket__( $userid, $id, $matrix, \@ranking ) );
2496                     my $count = $self->{parser__}->{words__}{$$idmap{$id}};
2497
2498                     $row_data{View_Score_Word} = $$idmap{$id};
2499                     $row_data{View_Score_Word_Color} = $wordcolor;
2500                     $row_data{View_Score_Word_Count} = $count;
2501
2502                     my $base_probability = 0;
2503                     if ( defined($$matrix{$id}{$ranking[0]}) && ( $$matrix{$id}{$ranking[0]} > 0 ) ) {
2504                         $base_probability = log( $$matrix{$id}{$ranking[0]} / $self->{db_bucketcount__}{$userid}{$ranking[0]} );
2505                     }
2506
2507                     my @per_bucket;
2508                     my @score;
2509                     foreach my $ix (0..($#buckets > 7? 7: $#buckets)) {
2510                         my %bucket_row;
2511                         my $bucket = $ranking[$ix];
2512                         my $probability = 0;
2513                         if ( defined($$matrix{$id}{$bucket}) && ( $$matrix{$id}{$bucket} > 0 ) ) {
2514                             $probability = log( $$matrix{$id}{$bucket} / $self->{db_bucketcount__}{$userid}{$bucket} );
2515                         }
2516                         my $color = 'black';
2517
2518                         if ( $probability >= $base_probability || $base_probability == 0 ) {
2519                             $color = $self->get_bucket_color( $session, $bucket );
2520                         }
2521
2522                         $bucket_row{View_Score_If_Probability} = ( $probability != 0 );
2523                         $bucket_row{View_Score_Word_Color} = $color;
2524                         if ( $probability != 0 ) {
2525                             my $wordprobstr;
2526                             if ($self->{wmformat__} eq 'score') {
2527                                 $wordprobstr  = sprintf("%12.4f", ($probability - $self->{not_likely__}{$userid})/$log10 );
2528                                 push ( @score, $wordprobstr );
2529                             } else {
2530                                 if ($self->{wmformat__} eq 'prob') {
2531                                     $wordprobstr  = sprintf("%12.4f", $wordprobs{$bucket,$id});
2532                                 } else {
2533                                     $wordprobstr  = sprintf("%13.5f", exp($probability) );
2534                                 }
2535                             }
2536                             $bucket_row{View_Score_Probability} = $wordprobstr;
2537                         }
2538                         else {
2539                             # Scores eq 0 must also be remembered.
2540                             push @score, 0;
2541                         }
2542                         push ( @per_bucket, \%bucket_row );
2543                     }
2544                     $row_data{View_Score_Loop_Per_Bucket} = \@per_bucket;
2545
2546                     # If we are doing the word scores then we build up
2547                     # a hash that maps the name of a word to a value
2548                     # which is the difference between the word scores
2549                     # for the top two buckets.  We later use this to
2550                     # draw a chart
2551
2552                     if ( $self->{wmformat__} eq 'score' ) {
2553                         $chart{$$idmap{$id}} = ( $score[0] || 0 ) - ( $score[1] || 0 );
2554                     }
2555
2556                     push ( @word_data, \%row_data );
2557                 }
2558             }
2559             $templ->param( 'View_Score_Loop_Words' => \@word_data );
2560
2561             if ( $self->{wmformat__} eq 'score' ) {
2562                 # Draw a chart that shows how the decision between the top
2563                 # two buckets was made.
2564
2565                 my @words = sort { $chart{$b} <=> $chart{$a} } keys %chart;
2566
2567                 my @chart_data;
2568                 my $max_chart = $chart{$words[0]};
2569                 my $min_chart = $chart{$words[$#words]};
2570                 my $scale = ( $max_chart > $min_chart ) ? 400 / ( $max_chart - $min_chart ) : 0;
2571
2572                 my $color_1 = $self->get_bucket_color( $session, $ranking[0] );
2573                 my $color_2 = $self->get_bucket_color( $session, $ranking[1] );
2574
2575                 $templ->param( 'Bucket_1' => $ranking[0] );
2576                 $templ->param( 'Bucket_2' => $ranking[1] );
2577
2578                 $templ->param( 'Color_Bucket_1' => $color_1 );
2579                 $templ->param( 'Color_Bucket_2' => $color_2 );
2580
2581                 $templ->param( 'Score_Bucket_1' => sprintf("%.3f", ($raw_score{$ranking[0]} - $correction)/$log10) );
2582                 $templ->param( 'Score_Bucket_2' => sprintf("%.3f", ($raw_score{$ranking[1]} - $correction)/$log10) );
2583
2584                 for ( my $i=0; $i <= $#words; $i++ ) {
2585                     my $word_1 = $words[$i];
2586                     my $word_2 = $words[$#words - $i];
2587
2588                     my $width_1 = int( $chart{$word_1} * $scale + .5 );
2589                     my $width_2 = int( $chart{$word_2} * $scale - .5 ) * -1;
2590
2591                     last if ( $width_1 <=0 && $width_2 <= 0 );
2592
2593                     my %row_data;
2594
2595                     $row_data{View_Chart_Word_1} = $word_1;
2596                     if ( $width_1 > 0 ) {
2597                         $row_data{View_If_Bar_1} = 1;
2598                         $row_data{View_Width_1}  = $width_1;
2599                         $row_data{View_Color_1}  = $color_1;
2600                         $row_data{Score_Word_1}  = sprintf "%.3f", $chart{$word_1};
2601                     }
2602                     else {
2603                         $row_data{View_If_Bar_1} = 0;
2604                     }
2605
2606                     $row_data{View_Chart_Word_2} = $word_2;
2607                     if ( $width_2 > 0 ) {
2608                         $row_data{View_If_Bar_2} = 1;
2609                         $row_data{View_Width_2}  = $width_2;
2610                         $row_data{View_Color_2}  = $color_2;
2611                         $row_data{Score_Word_2}  = sprintf "%.3f", $chart{$word_2};
2612                     }
2613                     else {
2614                         $row_data{View_If_Bar_2} = 0;
2615                     }
2616
2617                     push ( @chart_data, \%row_data );
2618                 }
2619                 $templ->param( 'View_Loop_Chart' => \@chart_data );
2620                 $templ->param( 'If_chart' => 1 );
2621             }
2622             else {
2623                 $templ->param( 'If_chart' => 0 );
2624             }
2625         }
2626     }
2627
2628     return $class;
2629 }
2630
2631 #----------------------------------------------------------------------------
2632 #
2633 # classify_and_modify
2634 #
2635 # This method reads an email terminated by . on a line by itself (or
2636 # the end of stream) from a handle and creates an entry in the
2637 # history, outputting the same email on another handle with the
2638 # appropriate header modifications and insertions
2639 #
2640 # $session  - A valid session key returned by a call to get_session_key
2641 # $mail     - an open stream to read the email from
2642 # $client   - an open stream to write the modified email to
2643 # $nosave   - set to 1 indicates that this should not save to history
2644 # $class    - if we already know the classification
2645 # $slot     - Must be defined if $class is set
2646 # $echo     - 1 to echo to the client, 0 to supress, defaults to 1
2647 # $crlf     - The sequence to use at the end of a line in the output,
2648 #   normally this is left undefined and this method uses $eol (the
2649 #   normal network end of line), but if this method is being used with
2650 #   real files you may wish to pass in \n instead
2651 #
2652 # Returns a classification if it worked and the slot ID of the history
2653 # item related to this classification
2654 #
2655 # IMPORTANT NOTE: $mail and $client should be binmode
2656 #
2657 #----------------------------------------------------------------------------
2658 sub classify_and_modify
2659 {
2660     my ( $self, $session, $mail, $client, $nosave, $class, $slot, $echo, $crlf ) = @_;
2661
2662     $echo = 1    unless (defined $echo);
2663     $crlf = $eol unless (defined $crlf);
2664
2665     my $msg_subject;              # The message subject
2666     my $msg_head_before = '';     # Store the message headers that
2667                                   # come before Subject here
2668     my $msg_head_after = '';      # Store the message headers that
2669                                   # come after Subject here
2670     my $msg_head_q      = '';     # Store questionable header lines here
2671     my $msg_body        = '';     # Store the message body here
2672     my $in_subject_header = 0;    # 1 if in Subject header
2673
2674     # These two variables are used to control the insertion of the
2675     # X-POPFile-TimeoutPrevention header when downloading long or slow
2676     # emails
2677
2678     my $last_timeout   = time;
2679     my $timeout_count  = 0;
2680
2681     # Indicates whether the first time through the receive loop we got
2682     # the full body, this will happen on small emails
2683
2684     my $got_full_body  = 0;
2685
2686     # The size of the message downloaded so far.
2687
2688     my $message_size   = 0;
2689
2690     # The classification for this message
2691
2692     my $classification = '';
2693
2694     # Whether we are currently reading the mail headers or not
2695
2696     my $getting_headers = 1;
2697
2698     my $msg_file;
2699
2700     # If we don't yet know the classification then start the parser
2701
2702     $class = '' if ( !defined( $class ) );
2703     if ( $class eq '' ) {
2704         $self->{parser__}->start_parse();
2705         ( $slot, $msg_file ) = $self->{history__}->reserve_slot();
2706     } else {
2707         $msg_file = $self->{history__}->get_slot_file( $slot );
2708     }
2709
2710     # We append .TMP to the filename for the MSG file so that if we are in
2711     # middle of downloading a message and we refresh the history we do not
2712     # get class file errors
2713
2714     open MSG, ">$msg_file" unless $nosave;
2715
2716     while ( my $line = $self->slurp_( $mail ) ) {
2717         my $fileline;
2718
2719         # This is done so that we remove the network style end of line
2720         # CR LF and allow Perl to decide on the local system EOL which
2721         # it will expand out of \n when this gets written to the temp
2722         # file
2723
2724         $fileline = $line;
2725         $fileline =~ s/[\r\n]//g;
2726         $fileline .= "\n";
2727
2728         # Check for an abort
2729
2730         last if ( $self->{alive_} == 0 );
2731
2732         # The termination of a message is a line consisting of exactly
2733         # .CRLF so we detect that here exactly
2734
2735         if ( $line =~ /^\.(\r\n|\r|\n)$/ ) {
2736             $got_full_body = 1;
2737             last;
2738         }
2739
2740         if ( $getting_headers )  {
2741
2742             # Kill header lines containing only whitespace (Exim does this)
2743
2744             next if ( $line =~ /^[ \t]+(\r\n|\r|\n)$/i );
2745
2746             if ( !( $line =~ /^(\r\n|\r|\n)$/i ) )  {
2747                 $message_size += length $line;
2748                 $self->write_line__( $nosave?undef:\*MSG, $fileline, $class );
2749
2750                 # If there is no echoing occuring, it doesn't matter
2751                 # what we do to these
2752
2753                 if ( $echo ) {
2754                     if ( $line =~ /^Subject:(.*)/i )  {
2755                         $msg_subject = $1;
2756                         $msg_subject =~ s/(\012|\015)//g;
2757                         $in_subject_header = 1;
2758                         next;
2759                     } elsif ( $line !~ /^[ \t]/ ) {
2760                         $in_subject_header = 0;
2761                     }
2762
2763                     # Strip out the X-Text-Classification header that
2764                     # is in an incoming message
2765
2766                     next if ( $line =~ /^X-Text-Classification:/i );
2767                     next if ( $line =~ /^X-POPFile-Link:/i );
2768
2769                     # Store any lines that appear as though they may
2770                     # be non-header content Lines that are headers
2771                     # begin with whitespace or Alphanumerics and "-"
2772                     # followed by a colon.
2773                     #
2774                     # This prevents weird things like HTML before the
2775                     # headers terminate from causing the XPL and XTC
2776                     # headers to be inserted in places some clients
2777                     # can't detect
2778
2779                     if ( ( $line =~ /^[ \t]/ ) && $in_subject_header ) {
2780                         $line =~ s/(\012|\015)//g;
2781                         $msg_subject .= $crlf . $line;
2782                         next;
2783                     }
2784
2785                     if ( $line =~ /^([ \t]|([A-Z0-9\-_]+:))/i ) {
2786                         if ( !defined($msg_subject) )  {
2787                             $msg_head_before .= $msg_head_q . $line;
2788                         } else {
2789                             $msg_head_after  .= $msg_head_q . $line;
2790                         }
2791                         $msg_head_q = '';
2792                     } else {
2793
2794                         # Gather up any header lines that are questionable
2795
2796                         $self->log_( 1, "Found odd email header: $line" );
2797                         $msg_head_q .= $line;
2798                     }
2799                 }
2800             } else {
2801                 $self->write_line__( $nosave?undef:\*MSG, "\n", $class );
2802                 $message_size += length $crlf;
2803                 $getting_headers = 0;
2804             }
2805         } else {
2806             $message_size += length $line;
2807             $msg_body     .= $line;
2808             $self->write_line__( $nosave?undef:\*MSG, $fileline, $class );
2809         }
2810
2811         # Check to see if too much time has passed and we need to keep
2812         # the mail client happy
2813
2814         if ( time > ( $last_timeout + 2 ) ) {
2815             print $client "X-POPFile-TimeoutPrevention: $timeout_count$crlf" if ( $echo );
2816             $timeout_count += 1;
2817             $last_timeout = time;
2818         }
2819
2820         last if ( ( $message_size > $self->global_config_( 'message_cutoff' ) ) && ( $getting_headers == 0 ) );
2821     }
2822
2823     close MSG unless $nosave;
2824
2825     # If we don't yet know the classification then stop the parser
2826     if ( $class eq '' ) {
2827         $self->{parser__}->stop_parse();
2828     }
2829
2830     # Do the text classification and update the counter for that
2831     # bucket that we just downloaded an email of that type
2832
2833     $classification = ($class ne '')?$class:$self->classify( $session, undef);
2834
2835     my $subject_modification = $self->get_bucket_parameter( $session, $classification, 'subject'    );
2836     my $xtc_insertion        = $self->get_bucket_parameter( $session, $classification, 'xtc'        );
2837     my $xpl_insertion        = $self->get_bucket_parameter( $session, $classification, 'xpl'        );
2838     my $quarantine           = $self->get_bucket_parameter( $session, $classification, 'quarantine' );
2839
2840     my $modification = $self->config_( 'subject_mod_left' ) . $classification . $self->config_( 'subject_mod_right' );
2841
2842     # Add the Subject line modification or the original line back again
2843     # Don't add the classification unless it is not present
2844
2845     my $original_msg_subject = $msg_subject;
2846
2847     if ( $subject_modification ) {
2848         if ( !defined( $msg_subject ) ) {   # PROFILE BLOCK START
2849             $msg_subject = " $modification";
2850         } elsif ( $msg_subject !~ /\Q$modification\E/ ) {
2851             $msg_subject = " $modification$msg_subject";
2852         }                                   # PROFILE BLOCK STOP
2853     }
2854
2855     if ( $quarantine ) {
2856         if ( defined( $original_msg_subject ) ) {
2857             $msg_head_before .= "Subject:$original_msg_subject$crlf";
2858         }
2859     } else {
2860         if ( defined( $msg_subject ) ) {
2861             $msg_head_before .= "Subject:$msg_subject$crlf";
2862         }
2863     }
2864
2865     # Add LF if $msg_head_after ends with CR to avoid header concatination
2866
2867     $msg_head_after =~ s/\015\z/$eol/;
2868
2869     # Add the XTC header
2870
2871     if ( ( $xtc_insertion ) && ( !$quarantine ) ) {
2872         $msg_head_after .= "X-Text-Classification: $classification$crlf";
2873     }
2874
2875     # Add the XPL header
2876
2877     my $xpl;
2878
2879     if ( $xpl_insertion ) {
2880
2881         my $host = $self->module_config_( 'html', 'local' ) ?   # PROFILE BLOCK START
2882                 $self->config_( 'localhostname' ) || '127.0.0.1' :
2883                 $self->config_( 'hostname' );                   # PROFILE BLOCK STOP
2884         my $port = $self->module_config_( 'html', 'port' );
2885
2886         $xpl = "http://$host:$port/jump_to_message?view=$slot";
2887
2888         $xpl = "<$xpl>" if ( $self->config_( 'xpl_angle' ) );
2889
2890         if ( !$quarantine ) {
2891             $msg_head_after .= "X-POPFile-Link: $xpl$crlf";
2892         }
2893     }
2894
2895     $msg_head_after .= $msg_head_q;
2896     $msg_head_after .= $crlf if ( !$getting_headers );
2897
2898     # Echo the text of the message to the client
2899
2900     if ( $echo ) {
2901
2902         # If the bucket is quarantined then we'll treat it specially
2903         # by changing the message header to contain information from
2904         # POPFile and wrapping the original message in a MIME encoding
2905
2906        if ( $quarantine == 1 ) {
2907            my ( $orig_from, $orig_to, $orig_subject ) = ( $self->{parser__}->get_header('from'), $self->{parser__}->get_header('to'), $self->{parser__}->get_header('subject') );
2908            my ( $encoded_from, $encoded_to ) = ( $orig_from, $orig_to );
2909            if ( $self->{parser__}->{lang__} eq 'Nihongo' ) {
2910                require Encode;
2911
2912                Encode::from_to( $orig_from, 'euc-jp', 'iso-2022-jp');
2913                Encode::from_to( $orig_to, 'euc-jp', 'iso-2022-jp');
2914                Encode::from_to( $orig_subject, 'euc-jp', 'iso-2022-jp');
2915
2916                $encoded_from = $orig_from;
2917                $encoded_to = $orig_to;
2918                $encoded_from =~ s/(\x1B\x24\x42.+\x1B\x28\x42)/"=?ISO-2022-JP?B?" . encode_base64($1,'') . "?="/eg;
2919                $encoded_to =~ s/(\x1B\x24\x42.+\x1B\x28\x42)/"=?ISO-2022-JP?B?" . encode_base64($1,'') . "?="/eg;
2920            }
2921
2922            print $client "From: $encoded_from$crlf";
2923            print $client "To: $encoded_to$crlf";
2924            print $client "Date: " . $self->{parser__}->get_header( 'date' ) . "$crlf";
2925            print $client "Subject:$msg_subject$crlf" if ( defined( $msg_subject ) );
2926            print $client "X-Text-Classification: $classification$crlf" if ( $xtc_insertion );
2927            print $client "X-POPFile-Link: $xpl$crlf" if ( $xpl_insertion );
2928            print $client "MIME-Version: 1.0$crlf";
2929            print $client "Content-Type: multipart/report; boundary=\"$slot\"$crlf$crlf--$slot$crlf";
2930            print $client "Content-Type: text/plain";
2931            print $client "; charset=iso-2022-jp" if ( $self->{parser__}->{lang__} eq 'Nihongo' );
2932            print $client "$crlf$crlf";
2933            print $client "POPFile has quarantined a message.  It is attached to this email.$crlf$crlf";
2934            print $client "Quarantined Message Detail$crlf$crlf";
2935
2936            print $client "Original From: $orig_from$crlf";
2937            print $client "Original To: $orig_to$crlf";
2938            print $client "Original Subject: $orig_subject$crlf";
2939
2940            print $client "To examine the email open the attachment. ";
2941            print $client "To change this mail's classification go to $xpl$crlf";
2942            print $client "$crlf";
2943            print $client "The first 20 words found in the email are:$crlf$crlf";
2944
2945            my $first20 = $self->{parser__}->first20();
2946            if ( $self->{parser__}->{lang__} eq 'Nihongo' ) {
2947                require Encode;
2948
2949                Encode::from_to( $first20, 'euc-jp', 'iso-2022-jp');
2950            }
2951
2952            print $client $first20;
2953            print $client "$crlf--$slot$crlf";
2954            print $client "Content-Type: message/rfc822$crlf$crlf";
2955         }
2956
2957         print $client $msg_head_before;
2958         print $client $msg_head_after;
2959         print $client $msg_body;
2960     }
2961
2962     my $before_dot = '';
2963
2964     if ( $quarantine && $echo ) {
2965         $before_dot = "$crlf--$slot--$crlf";
2966     }
2967
2968     my $need_dot = 0;
2969
2970     if ( $got_full_body ) {
2971         $need_dot = 1;
2972     } else {
2973         $need_dot = !$self->echo_to_dot_( $mail, $echo?$client:undef, $nosave?undef:'>>' . $msg_file, $before_dot ) && !$nosave;
2974     }
2975
2976     if ( $need_dot ) {
2977         print $client $before_dot if ( $before_dot ne '' );
2978         print $client ".$crlf"    if ( $echo );
2979     }
2980
2981     # In some cases it's possible (and totally illegal) to get a . in
2982     # the middle of the message, to cope with the we call flush_extra_
2983     # here to remove any extra stuff the POP3 server is sending Make
2984     # sure to supress output if we are not echoing, and to save to
2985     # file if not echoing and saving
2986
2987     if ( !($nosave || $echo) ) {
2988
2989         # if we're saving (not nosave) and not echoing, we can safely
2990         # unload this into the temp file
2991
2992         if (open FLUSH, ">$msg_file.flush") {
2993             binmode FLUSH;
2994
2995             # TODO: Do this in a faster way (without flushing to one
2996             # file then copying to another) (perhaps a select on $mail
2997             # to predict if there is flushable data)
2998
2999             $self->flush_extra_( $mail, \*FLUSH, 0);
3000             close FLUSH;
3001
3002             # append any data we got to the actual temp file
3003
3004             if ( ( (-s "$msg_file.flush") > 0 ) && ( open FLUSH, "<$msg_file.flush" ) ) {
3005                 binmode FLUSH;
3006                 if ( open TEMP, ">>$msg_file" ) {
3007                     binmode TEMP;
3008
3009                     # The only time we get data here is if it is after
3010                     # a CRLF.CRLF We have to re-create it to avoid
3011                     # data-loss
3012
3013                     print TEMP ".$crlf";
3014
3015                     print TEMP $_ while (<FLUSH>);
3016
3017                     # NOTE: The last line flushed MAY be a CRLF.CRLF,
3018                     # which isn't actually part of the message body
3019
3020                     close TEMP;
3021                 }
3022                 close FLUSH;
3023             }
3024             unlink("$msg_file.flush");
3025         }
3026     } else {
3027
3028         # if we are echoing, the client can make sure we have no data
3029         # loss otherwise, the data can be discarded (not saved and not
3030         # echoed)
3031
3032         $self->flush_extra_( $mail, $client, $echo?0:1);
3033     }
3034
3035     if ( $class eq '' ) {
3036         if ( $nosave ) {
3037             $self->{history__}->release_slot( $slot );
3038         } else {
3039             $self->{history__}->commit_slot( $session, $slot, $classification, $self->{magnet_detail__} );
3040         }
3041     }
3042
3043     return ( $classification, $slot, $self->{magnet_used__} );
3044 }
3045
3046 #----------------------------------------------------------------------------
3047 #
3048 # get_buckets
3049 #
3050 # Returns a list containing all the real bucket names sorted into
3051 # alphabetic order
3052 #
3053 # $session   A valid session key returned by a call to get_session_key
3054 #
3055 #----------------------------------------------------------------------------
3056 sub get_buckets
3057 {
3058     my ( $self, $session ) = @_;
3059
3060     my $userid = $self->valid_session_key__( $session );
3061     return undef if ( !defined( $userid ) );
3062
3063     # Note that get_buckets does not return pseudo buckets
3064
3065     my @buckets;
3066
3067     for my $b (sort keys %{$self->{db_bucketid__}{$userid}}) {
3068         if ( $self->{db_bucketid__}{$userid}{$b}{pseudo} == 0 ) {
3069             push @buckets, ($b);
3070         }
3071     }
3072
3073     return @buckets;
3074 }
3075
3076 #----------------------------------------------------------------------------
3077 #
3078 # get_bucket_id
3079 #
3080 # Returns the internal ID for a bucket for database calls
3081 #
3082 # $session   A valid session key returned by a call to get_session_key
3083 # $bucket    The bucket name
3084 #
3085 #----------------------------------------------------------------------------
3086 sub get_bucket_id
3087 {
3088     my ( $self, $session, $bucket ) = @_;
3089
3090     my $userid = $self->valid_session_key__( $session );
3091     return undef if ( !defined( $userid ) );
3092
3093     return $self->{db_bucketid__}{$userid}{$bucket}{id};
3094 }
3095
3096 #----------------------------------------------------------------------------
3097 #
3098 # get_bucket_name
3099 #
3100 # Returns the name of a bucket from an internal ID
3101 #
3102 # $session   A valid session key returned by a call to get_session_key
3103 # $id        The bucket id
3104 #
3105 #----------------------------------------------------------------------------
3106 sub get_bucket_name
3107 {
3108     my ( $self, $session, $id ) = @_;
3109
3110     my $userid = $self->valid_session_key__( $session );
3111     return undef if ( !defined( $userid ) );
3112
3113     foreach $b (keys %{$self->{db_bucketid__}{$userid}}) {
3114         if ( $id == $self->{db_bucketid__}{$userid}{$b}{id} ) {
3115             return $b;
3116         }
3117     }
3118
3119     return '';
3120 }
3121
3122 #----------------------------------------------------------------------------
3123 #
3124 # get_pseudo_buckets
3125 #
3126 # Returns a list containing all the pseudo bucket names sorted into
3127 # alphabetic order
3128 #
3129 # $session   A valid session key returned by a call to get_session_key
3130 #
3131 #----------------------------------------------------------------------------
3132 sub get_pseudo_buckets
3133 {
3134     my ( $self, $session ) = @_;
3135
3136     my $userid = $self->valid_session_key__( $session );
3137     return undef if ( !defined( $userid ) );
3138
3139     my @buckets;
3140
3141     for my $b (sort keys %{$self->{db_bucketid__}{$userid}}) {
3142         if ( $self->{db_bucketid__}{$userid}{$b}{pseudo} == 1 ) {
3143             push @buckets, ($b);
3144         }
3145     }
3146
3147     return @buckets;
3148 }
3149
3150 #----------------------------------------------------------------------------
3151 #
3152 # get_all_buckets
3153 #
3154 # Returns a list containing all the bucket names sorted into
3155 # alphabetic order
3156 #
3157 # $session   A valid session key returned by a call to get_session_key
3158 #
3159 #----------------------------------------------------------------------------
3160 sub get_all_buckets
3161 {
3162     my ( $self, $session ) = @_;
3163
3164     my $userid = $self->valid_session_key__( $session );
3165     return undef if ( !defined( $userid ) );
3166
3167     my @buckets;
3168
3169     for my $b (sort keys %{$self->{db_bucketid__}{$userid}}) {
3170          push @buckets, ($b);
3171     }
3172
3173     return @buckets;
3174 }
3175
3176 #----------------------------------------------------------------------------
3177 #
3178 # is_pseudo_bucket
3179 #
3180 # Returns 1 if the named bucket is pseudo
3181 #
3182 # $session   A valid session key returned by a call to get_session_key
3183 # $bucket    The bucket to check
3184 #
3185 #----------------------------------------------------------------------------
3186 sub is_pseudo_bucket
3187 {
3188     my ( $self, $session, $bucket ) = @_;
3189
3190     my $userid = $self->valid_session_key__( $session );
3191     return undef if ( !defined( $userid ) );
3192
3193     return ( defined($self->{db_bucketid__}{$userid}{$bucket})   # PROFILE BLOCK START
3194           && $self->{db_bucketid__}{$userid}{$bucket}{pseudo} ); # PROFILE BLOCK STOP
3195 }
3196
3197 #----------------------------------------------------------------------------
3198 #
3199 # is_bucket
3200 #
3201 # Returns 1 if the named bucket is a bucket
3202 #
3203 # $session   A valid session key returned by a call to get_session_key
3204 # $bucket    The bucket to check
3205 #
3206 #----------------------------------------------------------------------------
3207 sub is_bucket
3208 {
3209     my ( $self, $session, $bucket ) = @_;
3210
3211     my $userid = $self->valid_session_key__( $session );
3212     return undef if ( !defined( $userid ) );
3213
3214     return ( ( defined( $self->{db_bucketid__}{$userid}{$bucket} ) ) &&  # PROFILE BLOCK START
3215              ( !$self->{db_bucketid__}{$userid}{$bucket}{pseudo} ) );    # PROFILE BLOCK STOP
3216 }
3217
3218 #----------------------------------------------------------------------------
3219 #
3220 # get_bucket_word_count
3221 #
3222 # Returns the total word count (including duplicates) for the passed in bucket
3223 #
3224 # $session     A valid session key returned by a call to get_session_key
3225 # $bucket      The name of the bucket for which the word count is desired
3226 #
3227 #----------------------------------------------------------------------------
3228 sub get_bucket_word_count
3229 {
3230     my ( $self, $session, $bucket ) = @_;
3231
3232     my $userid = $self->valid_session_key__( $session );
3233     return undef if ( !defined( $userid ) );
3234
3235     my $c = $self->{db_bucketcount__}{$userid}{$bucket};
3236
3237     return defined($c)?$c:0;
3238 }
3239
3240 #----------------------------------------------------------------------------
3241 #
3242 # get_bucket_word_list
3243 #
3244 # Returns a list of words all with the same first character
3245 #
3246 # $session     A valid session key returned by a call to get_session_key
3247 # $bucket      The name of the bucket for which the word count is desired
3248 # $prefix      The first character of the words
3249 #
3250 #----------------------------------------------------------------------------
3251 sub get_bucket_word_list
3252 {
3253     my ( $self, $session, $bucket, $prefix ) = @_;
3254
3255     my $userid = $self->valid_session_key__( $session );
3256     return undef if ( !defined( $userid ) );
3257
3258     my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
3259     my $result = $self->{db__}->selectcol_arrayref(  # PROFILE BLOCK START
3260         "select words.word from matrix, words
3261          where matrix.wordid  = words.id and
3262                matrix.bucketid = $bucketid and
3263                words.word like '$prefix%';");        # PROFILE BLOCK STOP
3264
3265     return @{$result};
3266 }
3267
3268 #----------------------------------------------------------------------------
3269 #
3270 # get_bucket_word_prefixes
3271 #
3272 # Returns a list of all the initial letters of words in a bucket
3273 #
3274 # $session     A valid session key returned by a call to get_session_key
3275 # $bucket      The name of the bucket for which the word count is desired
3276 #
3277 #----------------------------------------------------------------------------
3278 sub get_bucket_word_prefixes
3279 {
3280     my ( $self, $session, $bucket ) = @_;
3281
3282     my $userid = $self->valid_session_key__( $session );
3283     return undef if ( !defined( $userid ) );
3284
3285     my $prev = '';
3286
3287     my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
3288     my $result = $self->{db__}->selectcol_arrayref(   # PROFILE BLOCK START
3289         "select words.word from matrix, words
3290          where matrix.wordid  = words.id and
3291                matrix.bucketid = $bucketid;");        # PROFILE BLOCK STOP
3292
3293     if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ) {
3294         return grep {$_ ne $prev && ($prev = $_, 1)} sort map {substr_euc__($_,0,1)} @{$result};
3295     } else {
3296         if  ( $self->module_config_( 'html', 'language' ) eq 'Korean' ) {
3297             return grep {$_ ne $prev && ($prev = $_, 1)} sort map {$_ =~ /([\x20-\x80]|$eksc)/} @{$result};
3298         } else {
3299             return grep {$_ ne $prev && ($prev = $_, 1)} sort map {substr($_,0,1)}  @{$result};
3300         }
3301     }
3302 }
3303
3304 #----------------------------------------------------------------------------
3305 #
3306 # get_word_count
3307 #
3308 # Returns the total word count (including duplicates)
3309 #
3310 # $session   A valid session key returned by a call to get_session_key
3311 #
3312 #----------------------------------------------------------------------------
3313 sub get_word_count
3314 {
3315     my ( $self, $session ) = @_;
3316
3317     my $userid = $self->valid_session_key__( $session );
3318     return undef if ( !defined( $userid ) );
3319
3320     $self->validate_sql_prepare_and_execute( $self->{db_get_full_total__}, $userid );
3321     return $self->{db_get_full_total__}->fetchrow_arrayref->[0] || 0;
3322 }
3323
3324 #----------------------------------------------------------------------------
3325 #
3326 # get_count_for_word
3327 #
3328 # Returns the number of times the word occurs in a bucket
3329 #
3330 # $session         A valid session key returned by a call to get_session_key
3331 # $bucket          The bucket we are asking about
3332 # $word            The word we are asking about
3333 #
3334 #----------------------------------------------------------------------------
3335 sub get_count_for_word
3336 {
3337     my ( $self, $session, $bucket, $word ) = @_;
3338
3339     my $userid = $self->valid_session_key__( $session );
3340     return undef if ( !defined( $userid ) );
3341
3342     return $self->get_base_value_( $session, $bucket, $word );
3343 }
3344
3345 #----------------------------------------------------------------------------
3346 #
3347 # get_bucket_unique_count
3348 #
3349 # Returns the unique word count (excluding duplicates) for the passed
3350 # in bucket
3351 #
3352 # $session     A valid session key returned by a call to get_session_key
3353 # $bucket      The name of the bucket for which the word count is desired
3354 #
3355 #----------------------------------------------------------------------------
3356 sub get_bucket_unique_count
3357 {
3358     my ( $self, $session, $bucket ) = @_;
3359
3360     my $userid = $self->valid_session_key__( $session );
3361     return undef if ( !defined( $userid ) );
3362
3363     my $c = $self->{db_bucketunique__}{$userid}{$bucket};
3364
3365     return defined($c)?$c:0;
3366 }
3367
3368 #----------------------------------------------------------------------------
3369 #
3370 # get_unique_word_count
3371 #
3372 # Returns the unique word count (excluding duplicates) for all buckets
3373 #
3374 # $session   A valid session key returned by a call to get_session_key
3375 #
3376 #----------------------------------------------------------------------------
3377 sub get_unique_word_count
3378 {
3379     my ( $self, $session ) = @_;
3380
3381     my $userid = $self->valid_session_key__( $session );
3382     return undef if ( !defined( $userid ) );
3383
3384     $self->validate_sql_prepare_and_execute( $self->{db_get_unique_word_count__}, $userid );
3385     return $self->{db_get_unique_word_count__}->fetchrow_arrayref->[0];
3386 }
3387
3388 #----------------------------------------------------------------------------
3389 #
3390 # get_bucket_color
3391 #
3392 # Returns the color associated with a bucket
3393 #
3394 # $session   A valid session key returned by a call to get_session_key
3395 # $bucket      The name of the bucket for which the color is requested
3396 #
3397 # NOTE  This API is DEPRECATED in favor of calling get_bucket_parameter for
3398 #       the parameter named 'color'
3399 #----------------------------------------------------------------------------
3400 sub get_bucket_color
3401 {
3402     my ( $self, $session, $bucket ) = @_;
3403
3404     return $self->get_bucket_parameter( $session, $bucket, 'color' );
3405 }
3406
3407 #----------------------------------------------------------------------------
3408 #
3409 # set_bucket_color
3410 #
3411 # Returns the color associated with a bucket
3412 #
3413 # $session     A valid session key returned by a call to get_session_key
3414 # $bucket      The name of the bucket for which the color is requested
3415 # $color       The new color
3416 #
3417 # NOTE  This API is DEPRECATED in favor of calling set_bucket_parameter for
3418 #       the parameter named 'color'
3419 #----------------------------------------------------------------------------
3420 sub set_bucket_color
3421 {
3422     my ( $self, $session, $bucket, $color ) = @_;
3423
3424     return $self->set_bucket_parameter( $session, $bucket, 'color', $color );
3425 }
3426
3427 #----------------------------------------------------------------------------
3428 #
3429 # get_bucket_parameter
3430 #
3431 # Returns the value of a per bucket parameter
3432 #
3433 # $session     A valid session key returned by a call to get_session_key
3434 # $bucket      The name of the bucket
3435 # $parameter   The name of the parameter
3436 #
3437 #----------------------------------------------------------------------------
3438 sub get_bucket_parameter
3439 {
3440     my ( $self, $session, $bucket, $parameter ) = @_;
3441
3442     my $userid = $self->valid_session_key__( $session );
3443     return undef if ( !defined( $userid ) );
3444
3445     # See if there's a cached value
3446
3447     if ( defined( $self->{db_parameters__}{$userid}{$bucket}{$parameter} ) ) {
3448         return $self->{db_parameters__}{$userid}{$bucket}{$parameter};
3449     }
3450
3451     # Make sure that the bucket passed in actually exists
3452
3453     if ( !defined( $self->{db_bucketid__}{$userid}{$bucket} ) ) {
3454         return undef;
3455     }
3456
3457     # Make sure that the parameter is valid
3458
3459     if ( !defined( $self->{db_parameterid__}{$parameter} ) ) {
3460         return undef;
3461     }
3462
3463     # If there is a non-default value for this parameter then return it.
3464
3465     $self->validate_sql_prepare_and_execute( $self->{db_get_bucket_parameter__},
3466         $self->{db_bucketid__}{$userid}{$bucket}{id},
3467         $self->{db_parameterid__}{$parameter} );
3468     my $result = $self->{db_get_bucket_parameter__}->fetchrow_arrayref;
3469
3470     # If this parameter has not been defined for this specific bucket then
3471     # get the default value
3472
3473     if ( !defined( $result ) ) {
3474         $self->validate_sql_prepare_and_execute( $self->{db_get_bucket_parameter_default__},  # PROFILE BLOCK START
3475             $self->{db_parameterid__}{$parameter} );          # PROFILE BLOCK STOP
3476         $result = $self->{db_get_bucket_parameter_default__}->fetchrow_arrayref;
3477     }
3478
3479     if ( defined( $result ) ) {
3480         $self->{db_parameters__}{$userid}{$bucket}{$parameter} = $result->[0];
3481         return $result->[0];
3482     } else {
3483         return undef;
3484     }
3485 }
3486
3487 #----------------------------------------------------------------------------
3488 #
3489 # set_bucket_parameter
3490 #
3491 # Sets the value associated with a bucket specific parameter
3492 #
3493 # $session     A valid session key returned by a call to get_session_key
3494 # $bucket      The name of the bucket
3495 # $parameter   The name of the parameter
3496 # $value       The new value
3497 #
3498 #----------------------------------------------------------------------------
3499 sub set_bucket_parameter
3500 {
3501     my ( $self, $session, $bucket, $parameter, $value ) = @_;
3502
3503     my $userid = $self->valid_session_key__( $session );
3504     return undef if ( !defined( $userid ) );
3505
3506     # Make sure that the bucket passed in actually exists
3507
3508     if ( !defined( $self->{db_bucketid__}{$userid}{$bucket} ) ) {
3509         return undef;
3510     }
3511
3512     my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
3513     my $btid     = $self->{db_parameterid__}{$parameter};
3514
3515     # Exactly one row should be affected by this statement
3516
3517     $self->validate_sql_prepare_and_execute( $self->{db_set_bucket_parameter__}, $bucketid, $btid, $value );
3518
3519     if ( defined( $self->{db_parameters__}{$userid}{$bucket}{$parameter} ) ) {
3520         $self->{db_parameters__}{$userid}{$bucket}{$parameter} = $value;
3521     }
3522
3523     return 1;
3524 }
3525
3526 #----------------------------------------------------------------------------
3527 #
3528 # get_html_colored_message
3529 #
3530 # Parser a mail message stored in a file and returns HTML representing
3531 # the message with coloring of the words
3532 #
3533 # $session        A valid session key returned by a call to get_session_key
3534 # $file           The file to parse
3535 #
3536 #----------------------------------------------------------------------------
3537 sub get_html_colored_message
3538 {
3539     my ( $self, $session, $file ) = @_;
3540
3541     my $userid = $self->valid_session_key__( $session );
3542     return undef if ( !defined( $userid ) );
3543
3544     $self->{parser__}->{color__} = $session;
3545     $self->{parser__}->{color_matrix__} = undef;
3546     $self->{parser__}->{color_idmap__}  = undef;
3547     $self->{parser__}->{color_userid__} = undef;
3548     $self->{parser__}->{bayes__} = bless $self;
3549
3550     my $result = $self->{parser__}->parse_file( $file,   # PROFILE BLOCK START
3551            $self->global_config_( 'message_cutoff'   ) ); # PROFILE BLOCK STOP
3552
3553     $self->{parser__}->{color__} = '';
3554
3555     return $result;
3556 }
3557
3558 #----------------------------------------------------------------------------
3559 #
3560 # fast_get_html_colored_message
3561 #
3562 # Parser a mail message stored in a file and returns HTML representing the message
3563 # with coloring of the words
3564 #
3565 # $session        A valid session key returned by a call to get_session_key
3566 # $file           The file to colorize
3567 # $matrix         Reference to the matrix hash from a call to classify
3568 # $idmap          Reference to the idmap hash from a call to classify
3569 #
3570 #----------------------------------------------------------------------------
3571 sub fast_get_html_colored_message
3572 {
3573     my ( $self, $session, $file, $matrix, $idmap ) = @_;
3574
3575     my $userid = $self->valid_session_key__( $session );
3576     return undef if ( !defined( $userid ) );
3577
3578     $self->{parser__}->{color__}        = $session;
3579     $self->{parser__}->{color_matrix__} = $matrix;
3580     $self->{parser__}->{color_idmap__}  = $idmap;
3581     $self->{parser__}->{color_userid__} = $userid;
3582     $self->{parser__}->{bayes__}        = bless $self;
3583
3584     my $result = $self->{parser__}->parse_file( $file,
3585                                                 $self->global_config_( 'message_cutoff'   ) );
3586
3587     $self->{parser__}->{color__} = '';
3588
3589     return $result;
3590 }
3591
3592 #----------------------------------------------------------------------------
3593 #
3594 # create_bucket
3595 #
3596 # Creates a new bucket, returns 1 if the creation succeeded
3597 #
3598 # $session     A valid session key returned by a call to get_session_key
3599 # $bucket      Name for the new bucket
3600 #
3601 #----------------------------------------------------------------------------
3602 sub create_bucket
3603 {
3604     my ( $self, $session, $bucket ) = @_;
3605
3606     if ( $self->is_bucket( $session, $bucket ) ||           # PROFILE BLOCK START
3607          $self->is_pseudo_bucket( $session, $bucket ) ) {   # PROFILE BLOCK STOP
3608         return 0;
3609     }
3610
3611     my $userid = $self->valid_session_key__( $session );
3612     return undef if ( !defined( $userid ) );
3613
3614     $bucket = $self->{db__}->quote( $bucket );
3615
3616     $self->{db__}->do(                                                                    # PROFILE BLOCK START
3617         "insert into buckets ( name, pseudo, userid ) values ( $bucket, 0, $userid );" ); # PROFILE BLOCK STOP
3618     $self->db_update_cache__( $session );
3619
3620     return 1;
3621 }
3622
3623 #----------------------------------------------------------------------------
3624 #
3625 # delete_bucket
3626 #
3627 # Deletes a bucket, returns 1 if the delete succeeded
3628 #
3629 # $session     A valid session key returned by a call to get_session_key
3630 # $bucket      Name of the bucket to delete
3631 #
3632 #----------------------------------------------------------------------------
3633 sub delete_bucket
3634 {
3635     my ( $self, $session, $bucket ) = @_;
3636
3637     my $userid = $self->valid_session_key__( $session );
3638     return undef if ( !defined( $userid ) );
3639
3640     # Make sure that the bucket passed in actually exists
3641
3642     if ( !defined( $self->{db_bucketid__}{$userid}{$bucket} ) ) {
3643         return 0;
3644     }
3645
3646     $self->{db__}->do(                                                                        # PROFILE BLOCK START
3647         "delete from buckets where buckets.userid = $userid and buckets.name = '$bucket';" ); # PROFILE BLOCK STOP
3648     $self->db_update_cache__( $session );
3649
3650     return 1;
3651 }
3652
3653 #----------------------------------------------------------------------------
3654 #
3655 # rename_bucket
3656 #
3657 # Renames a bucket, returns 1 if the rename succeeded
3658 #
3659 # $session             A valid session key returned by a call to get_session_key
3660 # $old_bucket          The old name of the bucket
3661 # $new_bucket          The new name of the bucket
3662 #
3663 #----------------------------------------------------------------------------
3664 sub rename_bucket
3665 {
3666     my ( $self, $session, $old_bucket, $new_bucket ) = @_;
3667
3668     my $userid = $self->valid_session_key__( $session );
3669     return undef if ( !defined( $userid ) );
3670
3671     # Make sure that the bucket passed in actually exists
3672
3673     if ( !defined( $self->{db_bucketid__}{$userid}{$old_bucket} ) ) {
3674         $self->log_( 0, "Bad bucket name $old_bucket to rename_bucket" );
3675         return 0;
3676     }
3677
3678     my $id = $self->{db__}->quote( $self->{db_bucketid__}{$userid}{$old_bucket}{id} );
3679     $new_bucket = $self->{db__}->quote( $new_bucket );
3680
3681     $self->log_( 1, "Rename bucket $old_bucket to $new_bucket" );
3682
3683     my $result = $self->{db__}->do( "update buckets set name = $new_bucket where id = $id;" );
3684
3685     if ( !defined( $result ) || ( $result == -1 ) ) {
3686         return 0;
3687     } else {
3688         $self->db_update_cache__( $session );
3689         return 1;
3690     }
3691 }
3692
3693 #----------------------------------------------------------------------------
3694 #
3695 # add_messages_to_bucket
3696 #
3697 # Parses mail messages and updates the statistics in the specified bucket
3698 #
3699 # $session         A valid session key returned by a call to get_session_key
3700 # $bucket          Name of the bucket to be updated
3701 # @files           List of file names to parse
3702 #
3703 #----------------------------------------------------------------------------
3704 sub add_messages_to_bucket
3705 {
3706     my ( $self, $session, $bucket, @files ) = @_;
3707
3708     my $userid = $self->valid_session_key__( $session );
3709     return undef if ( !defined( $userid ) );
3710
3711     if ( !defined( $self->{db_bucketid__}{$userid}{$bucket}{id} ) ) {
3712         return 0;
3713     }
3714
3715     # This is done to clear out the word list because in the loop
3716     # below we are going to not reset the word list on each parse
3717
3718     $self->{parser__}->start_parse();
3719     $self->{parser__}->stop_parse();
3720
3721     foreach my $file (@files) {
3722         $self->{parser__}->parse_file( $file,  # PROFILE BLOCK START
3723             $self->global_config_( 'message_cutoff'   ),
3724             0 );  # PROFILE BLOCK STOP (Do not reset word list)
3725     }
3726
3727     $self->add_words_to_bucket__( $session, $bucket, 1 );
3728     $self->db_update_cache__( $session );
3729
3730     return 1;
3731 }
3732
3733 #----------------------------------------------------------------------------
3734 #
3735 # add_message_to_bucket
3736 #
3737 # Parses a mail message and updates the statistics in the specified bucket
3738 #
3739 # $session         A valid session key returned by a call to get_session_key
3740 # $bucket          Name of the bucket to be updated
3741 # $file            Name of file containing mail message to parse
3742 #
3743 #----------------------------------------------------------------------------
3744 sub add_message_to_bucket
3745 {
3746     my ( $self, $session, $bucket, $file ) = @_;
3747
3748     my $userid = $self->valid_session_key__( $session );
3749     return undef if ( !defined( $userid ) );
3750
3751     return $self->add_messages_to_bucket( $session, $bucket, $file );
3752 }
3753
3754 #----------------------------------------------------------------------------
3755 #
3756 # remove_message_from_bucket
3757 #
3758 # Parses a mail message and updates the statistics in the specified bucket
3759 #
3760 # $session         A valid session key returned by a call to get_session_key
3761 # $bucket          Name of the bucket to be updated
3762 # $file            Name of file containing mail message to parse
3763 #
3764 #----------------------------------------------------------------------------
3765 sub remove_message_from_bucket
3766 {
3767     my ( $self, $session, $bucket, $file ) = @_;
3768
3769     my $userid = $self->valid_session_key__( $session );
3770     return undef if ( !defined( $userid ) );
3771
3772     $self->{parser__}->parse_file( $file,               # PROFILE BLOCK START
3773          $self->global_config_( 'message_cutoff'   ) ); # PROFILE BLOCK STOP
3774     $self->add_words_to_bucket__( $session, $bucket, -1 );
3775
3776     $self->db_update_cache__( $session );
3777
3778     return 1;
3779 }
3780
3781 #----------------------------------------------------------------------------
3782 #
3783 # get_buckets_with_magnets
3784 #
3785 # Returns the names of the buckets for which magnets are defined
3786 #
3787 # $session     A valid session key returned by a call to get_session_key
3788 #
3789 #----------------------------------------------------------------------------
3790 sub get_buckets_with_magnets
3791 {
3792     my ( $self, $session ) = @_;
3793
3794     my $userid = $self->valid_session_key__( $session );
3795     return undef if ( !defined( $userid ) );
3796
3797     my @result;
3798
3799     $self->validate_sql_prepare_and_execute( $self->{db_get_buckets_with_magnets__}, $userid );
3800     while ( my $row = $self->{db_get_buckets_with_magnets__}->fetchrow_arrayref ) {
3801         push @result, ($row->[0]);
3802     }
3803
3804     return @result;
3805 }
3806
3807 #----------------------------------------------------------------------------
3808 #
3809 # get_magnet_types_in_bucket
3810 #
3811 # Returns the types of the magnets in a specific bucket
3812 #
3813 # $session     A valid session key returned by a call to get_session_key
3814 # $bucket      The bucket to search for magnets
3815 #
3816 #----------------------------------------------------------------------------
3817 sub get_magnet_types_in_bucket
3818 {
3819     my ( $self, $session, $bucket ) = @_;
3820
3821     my $userid = $self->valid_session_key__( $session );
3822     return undef if ( !defined( $userid ) );
3823
3824     my @result;
3825
3826     my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
3827     my $h = $self->validate_sql_prepare_and_execute( "select magnet_types.mtype from magnet_types, magnets, buckets
3828         where magnet_types.id = magnets.mtid and
3829               magnets.bucketid = buckets.id and
3830               buckets.id = $bucketid
3831               group by magnet_types.mtype
3832               order by magnet_types.mtype;" );
3833
3834     while ( my $row = $h->fetchrow_arrayref ) {
3835         push @result, ($row->[0]);
3836     }
3837     $h->finish;
3838
3839     return @result;
3840 }
3841
3842 #----------------------------------------------------------------------------
3843 #
3844 # clear_bucket
3845 #
3846 # Removes all words from a bucket
3847 #
3848 # $session        A valid session key returned by a call to get_session_key
3849 # $bucket         The bucket to clear
3850 #
3851 #----------------------------------------------------------------------------
3852 sub clear_bucket
3853 {
3854     my ( $self, $session, $bucket ) = @_;
3855
3856     my $userid = $self->valid_session_key__( $session );
3857     return undef if ( !defined( $userid ) );
3858
3859     my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
3860
3861     $self->{db__}->do( "delete from matrix where matrix.bucketid = $bucketid;" );
3862     $self->db_update_cache__( $session );
3863 }
3864
3865 #----------------------------------------------------------------------------
3866 #
3867 # clear_magnets
3868 #
3869 # Removes every magnet currently defined
3870 #
3871 # $session     A valid session key returned by a call to get_session_key
3872 #
3873 #----------------------------------------------------------------------------
3874 sub clear_magnets
3875 {
3876     my ( $self, $session ) = @_;
3877
3878     my $userid = $self->valid_session_key__( $session );
3879     return undef if ( !defined( $userid ) );
3880
3881     for my $bucket (keys %{$self->{db_bucketid__}{$userid}}) {
3882         my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
3883         $self->{db__}->do( "delete from magnets where magnets.bucketid = $bucketid;" );
3884     }
3885 }
3886
3887 #----------------------------------------------------------------------------
3888 #
3889 # get_magnets
3890 #
3891 # Returns the magnets of a certain type in a bucket
3892 #
3893 # $session         A valid session key returned by a call to get_session_key
3894 # $bucket          The bucket to search for magnets
3895 # $type            The magnet type (e.g. from, to or subject)
3896 #
3897 #----------------------------------------------------------------------------
3898 sub get_magnets
3899 {
3900     my ( $self, $session, $bucket, $type ) = @_;
3901
3902     my $userid = $self->valid_session_key__( $session );
3903     return undef if ( !defined( $userid ) );
3904
3905     my @result;
3906
3907     my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
3908     my $h = $self->validate_sql_prepare_and_execute( "select magnets.val from magnets, magnet_types
3909         where magnets.bucketid = $bucketid and
3910               magnets.id != 0 and
3911               magnet_types.id = magnets.mtid and
3912               magnet_types.mtype = '$type' order by magnets.val;" );
3913
3914     while ( my $row = $h->fetchrow_arrayref ) {
3915         push @result, ($row->[0]);
3916     }
3917     $h->finish;
3918
3919     return @result;
3920 }
3921
3922 #----------------------------------------------------------------------------
3923 #
3924 # create_magnet
3925 #
3926 # Make a new magnet
3927 #
3928 # $session         A valid session key returned by a call to get_session_key
3929 # $bucket          The bucket the magnet belongs in
3930 # $type            The magnet type (e.g. from, to or subject)
3931 # $text            The text of the magnet
3932 #
3933 #----------------------------------------------------------------------------
3934 sub create_magnet
3935 {
3936     my ( $self, $session, $bucket, $type, $text ) = @_;
3937
3938     my $userid = $self->valid_session_key__( $session );
3939     return undef if ( !defined( $userid ) );
3940
3941     my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
3942     my $result = $self->{db__}->selectrow_arrayref("select magnet_types.id from magnet_types
3943                                                         where magnet_types.mtype = '$type';" );
3944
3945     my $mtid = $result->[0];
3946
3947     $text = $self->{db__}->quote( $text );
3948
3949     $self->{db__}->do( "insert into magnets ( bucketid, mtid, val )
3950                                      values ( $bucketid, $mtid, $text );" );
3951 }
3952
3953 #----------------------------------------------------------------------------
3954 #
3955 # get_magnet_types
3956 #
3957 # Get a hash mapping magnet types (e.g. from) to magnet names (e.g. From);
3958 #
3959 # $session     A valid session key returned by a call to get_session_key
3960 #
3961 #----------------------------------------------------------------------------
3962 sub get_magnet_types
3963 {
3964     my ( $self, $session ) = @_;
3965
3966     my $userid = $self->valid_session_key__( $session );
3967     return undef if ( !defined( $userid ) );
3968
3969     my %result;
3970
3971     my $h = $self->validate_sql_prepare_and_execute( "select magnet_types.mtype, magnet_types.header from magnet_types order by mtype;" );
3972
3973     while ( my $row = $h->fetchrow_arrayref ) {
3974         $result{$row->[0]} = $row->[1];
3975     }
3976     $h->finish;
3977
3978     return %result;
3979 }
3980
3981 #----------------------------------------------------------------------------
3982 #
3983 # delete_magnet
3984 #
3985 # Remove a new magnet
3986 #
3987 # $session         A valid session key returned by a call to get_session_key
3988 # $bucket          The bucket the magnet belongs in
3989 # $type            The magnet type (e.g. from, to or subject)
3990 # $text            The text of the magnet
3991 #
3992 #----------------------------------------------------------------------------
3993 sub delete_magnet
3994 {
3995     my ( $self, $session, $bucket, $type, $text ) = @_;
3996
3997     my $userid = $self->valid_session_key__( $session );
3998     return undef if ( !defined( $userid ) );
3999
4000     my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
4001     my $result = $self->{db__}->selectrow_arrayref("select magnet_types.id from magnet_types
4002                                                         where magnet_types.mtype = '$type';" );
4003
4004     my $mtid = $result->[0];
4005     $text = $self->{db__}->quote( $text );
4006
4007     $self->{db__}->do( "delete from magnets
4008                             where magnets.bucketid = $bucketid and
4009                                   magnets.mtid = $mtid and
4010                                   magnets.val  = $text;" );
4011 }
4012
4013 #----------------------------------------------------------------------------
4014 #
4015 # get_stopword_list
4016 #
4017 # Gets the complete list of stop words
4018 #
4019 # $session     A valid session key returned by a call to get_session_key
4020 #
4021 #----------------------------------------------------------------------------
4022 sub get_stopword_list
4023 {
4024     my ( $self, $session ) = @_;
4025
4026     my $userid = $self->valid_session_key__( $session );
4027     return undef if ( !defined( $userid ) );
4028
4029     return $self->{parser__}->{mangle__}->stopwords();
4030 }
4031
4032 #----------------------------------------------------------------------------
4033 #
4034 # magnet_count
4035 #
4036 # Gets the number of magnets that are defined
4037 #
4038 # $session     A valid session key returned by a call to get_session_key
4039 #
4040 #----------------------------------------------------------------------------
4041 sub magnet_count
4042 {
4043     my ( $self, $session ) = @_;
4044
4045     my $userid = $self->valid_session_key__( $session );
4046     return undef if ( !defined( $userid ) );
4047
4048     my $result = $self->{db__}->selectrow_arrayref( "select count(*) from magnets, buckets
4049         where buckets.userid = $userid and
4050               magnets.id != 0 and
4051               magnets.bucketid = buckets.id;" );
4052
4053     if ( defined( $result ) ) {
4054         return $result->[0];
4055     } else {
4056         return 0;
4057     }
4058 }
4059
4060 #----------------------------------------------------------------------------
4061 #
4062 # add_stopword, remove_stopword
4063 #
4064 # Adds or removes a stop word
4065 #
4066 # $session     A valid session key returned by a call to get_session_key
4067 # $stopword    The word to add or remove
4068 #
4069 # Return 0 for a bad stop word, and 1 otherwise
4070 #
4071 #----------------------------------------------------------------------------
4072 sub add_stopword
4073 {
4074     my ( $self, $session, $stopword ) = @_;
4075
4076     my $userid = $self->valid_session_key__( $session );
4077     return undef if ( !defined( $userid ) );
4078
4079     # Pass language parameter to add_stopword()
4080
4081     return $self->{parser__}->{mangle__}->add_stopword( $stopword, $self->module_config_( 'html', 'language' ) );
4082 }
4083
4084 sub remove_stopword
4085 {
4086     my ( $self, $session, $stopword ) = @_;
4087
4088     my $userid = $self->valid_session_key__( $session );
4089     return undef if ( !defined( $userid ) );
4090
4091     # Pass language parameter to remove_stopword()
4092
4093     return $self->{parser__}->{mangle__}->remove_stopword( $stopword, $self->module_config_( 'html', 'language' ) );
4094 }
4095
4096
4097 #----------------------------------------------------------------------------
4098 #
4099 # db_quote
4100 #
4101 # Quote a string for use in a sql statement. Before calling DBI::quote on the
4102 # string the string is also checked for any null-bytes.
4103 #
4104 # $string   The string that should be quoted.
4105 #
4106 # returns the quoted string without any possible null-bytes
4107 #----------------------------------------------------------------------------
4108 sub db_quote {
4109     my $self   = shift;
4110     my $string = shift;
4111
4112     my $backup = $string;
4113     if ( $string =~ s/\x00//g ) {
4114         my ( $package, $file, $line ) = caller;
4115         $self->log_( 0, "Found null-byte in string '$backup'. Called from package '$package' ($file), line $line." );
4116     }
4117
4118     return $self->{db__}->quote( $string );
4119 }
4120
4121
4122 #----------------------------------------------------------------------------
4123 #
4124 # validate_sql_prepare_and_execute
4125 #
4126 # This method will prepare sql statements and execute them.
4127 # The statement itself and any binding parameters are also
4128 # tested for possible null-characters (\x00).
4129 # If you pass in a handle to a prepared statement, the statement
4130 # will be executed and possible binding-parameters are checked.
4131 #
4132 # $statement  The sql statement to prepare or the prepared statement handle
4133 # @args       The (optional) list of binding parameters
4134 #
4135 # Returns the result of prepare()
4136 #----------------------------------------------------------------------------
4137 sub validate_sql_prepare_and_execute {
4138     my $self = shift;
4139     my $sql_or_sth  = shift;
4140     my @args = @_;
4141
4142     my $dbh = $self->db();
4143     my $sth = undef;
4144
4145     # Is this a statement-handle or a sql string?
4146     if ( (ref $sql_or_sth) =~ m/^DBI::/ ) {
4147         $sth = $sql_or_sth;
4148     }
4149     else {
4150         my $sql = $sql_or_sth;
4151         $sql = $self->check_for_nullbytes( $sql );
4152         $sth = $dbh->prepare( $sql );
4153     }
4154
4155     my $execute_result = undef;
4156
4157     # Any binding-params?
4158     if ( @args ) {
4159         foreach my $arg ( @args ) {
4160             $arg = $self->check_for_nullbytes( $arg );
4161         }
4162         $execute_result = $sth->execute( @args );
4163     }
4164     else {
4165         $execute_result = $sth->execute();
4166     }
4167
4168     unless ( $execute_result ) {
4169         my ( $package, $file, $line ) = caller;
4170         $self->log_( 0, "DBI::execute failed.  Called from package '$package' ($file), line $line." );
4171     }
4172
4173     return $sth;
4174 }
4175
4176
4177 #----------------------------------------------------------------------------
4178 #
4179 # check_for_nullbytes
4180 #
4181 # Will check a passed-in string for possible null-bytes and log and error
4182 # message in case a null-byte is found.
4183 #
4184 # Will return the string with any null-bytes removed.
4185 #----------------------------------------------------------------------------
4186 sub check_for_nullbytes {
4187     my $self = shift;
4188     my $string = shift;
4189
4190     if ( defined $string ) {
4191         my $backup = $string;
4192
4193         if ( my $count = ( $string =~ s/\x00//g ) ) {
4194             my ( $package, $file, $line ) = caller( 1 );
4195             $self->log_( 0, "Found $count null-character(s) in string '$backup'. Called from package '$package' ($file), line $line." );
4196         }
4197     }
4198
4199     return $string;
4200 }
4201
4202
4203 #----------------------------------------------------------------------------
4204 #----------------------------------------------------------------------------
4205 # _____   _____   _____  _______ _____        _______   _______  _____  _____
4206 #|_____] |     | |_____] |______   |   |      |______   |_____| |_____]   |
4207 #|       |_____| |       |       __|__ |_____ |______   |     | |       __|__
4208 #
4209 #----------------------------------------------------------------------------
4210 #----------------------------------------------------------------------------
4211
4212 # GETTERS/SETTERS
4213
4214 sub wordscores
4215 {
4216     my ( $self, $value ) = @_;
4217
4218     $self->{wordscores__} = $value if (defined $value);
4219     return $self->{wordscores__};
4220 }
4221
4222 sub wmformat
4223 {
4224     my ( $self, $value ) = @_;
4225
4226     $self->{wmformat__} = $value if (defined $value);
4227     return $self->{wmformat__};
4228 }
4229
4230 sub db
4231 {
4232     my ( $self ) = @_;
4233
4234     return $self->{db__};
4235 }
4236
4237 sub history
4238 {
4239     my ( $self, $history ) = @_;
4240
4241     $self->{history__} = $history;
4242 }
4243
4244 1;
4245