#!/usr/bin/env python
# xmltraintest.py v0.02 (March 23, 2004)
# by Joseph Connors
"""Train or test a POPFile corpus from an archive of messages over XML-RPC.

The archive directory holds one sub-folder per bucket; each sub-folder holds
message files named ``popfile<day>=<number>.msg``.  Every message is sent to
POPFile for classification and, unless testing only, messages are added to
the bucket named after their folder (only the misclassified ones by default,
all of them with ``-a``).

The code runs unchanged on Python 2 and Python 3.
"""

import fnmatch
import getopt
import glob
import os
import re
import string
import sys
import time

try:
    from xmlrpclib import ServerProxy  # Python 2
except ImportError:
    from xmlrpc.client import ServerProxy  # Python 3

# Archive message file name: popfile<day>=<message number>.msg
_MESSAGE_NAME = re.compile(r"popfile(\d+)=(\d+)\.msg")

# Number of progress marks written per output row.
_MARKS_PER_ROW = 50


def _out(*fields):
    """Write the fields separated by single spaces, followed by a newline.

    Produces the same text as a Python 2 ``print a, b`` statement while
    remaining valid syntax on Python 3.
    """
    sys.stdout.write(" ".join("%s" % (field,) for field in fields) + "\n")


def help(message):
    """Return one of the program's fixed messages.

    message -- 0 for the usage text, 1 for the "unable to connect" text.

    Raises ValueError for any other value.
    """
    if message == 0:
        # The archive directory is a required argument, so the usage line
        # names it.
        return (
            "Usage: python " + sys.argv[0] + " [options] <archive directory>\n"
            "\n"
            "Options:\n"
            " -a Set training mode to Train Always (default is Train Only Errors)\n"
            " -c Clear exising corpus if it exists\n"
            " -d Display detailed information about buckets\n"
            " -r# Display row accuracy every # rows (0 turns off row accuracy display)\n"
            " -t Test archive against current corpus (no changes made to corpus)\n"
            " -? Display this help message\n"
            "\n"
            "Warning: You should backup your corpus before using this program.\n"
            " It will modify or replace your existing corpus."
        )
    if message == 1:
        return (
            "Error: Unable to connect to POPFile.\n"
            " Make sure POPFile is running.\n"
            " You must also have the XMLRPC.pm module installed in POPFile\n"
            " and set xmlrpc_enabled to 1 on POPFile's Advanced page."
        )
    raise ValueError("unknown help message number: %r" % (message,))


def _message_sort_key(name):
    """Return ``(day, number)`` parsed from a message file name or path.

    Only the part after the last "/" is examined, so the bucket folder name
    cannot influence the ordering.  Raises ValueError when the name does not
    look like ``popfile<day>=<number>.msg``.
    """
    found = _MESSAGE_NAME.search(name.rsplit("/", 1)[-1])
    if found is None:
        raise ValueError("not a POPFile message file name: %r" % (name,))
    return int(found.group(1)), int(found.group(2))


def compare_mf(a, b):
    """Compare two message file names, oldest first.

    Python version of POPFile's file name comparison with the sort order
    reversed: names are ordered by day number, then by message number.
    Returns -1, 0 or 1 like the Python 2 ``cmp`` built-in.
    """
    key_a = _message_sort_key(a)
    key_b = _message_sort_key(b)
    return (key_a > key_b) - (key_a < key_b)


def parse_options(argv):
    """Parse command line arguments (without the program name).

    Returns a dict with the keys ``train_method`` ("toe", "ta" or "test"),
    ``clear_corpus`` (0/1), ``detail`` (0/1), ``row`` (int >= 0) and
    ``path`` (the archive directory).

    Raises getopt.GetoptError for invalid input and exits with the usage
    text when ``-?`` is given.
    """
    opts, paths = getopt.gnu_getopt(argv, "acdr:t?")
    settings = {
        "train_method": "toe",  # default to Train Only Errors
        "clear_corpus": 0,      # default: use the existing corpus
        "detail": 0,            # default: no bucket details
        "row": 2,               # default: row accuracy every 2 rows (100 messages)
    }
    test_only = False
    for flag, value in opts:
        if flag == "-a":
            settings["train_method"] = "ta"
        elif flag == "-c":
            settings["clear_corpus"] = 1
        elif flag == "-d":
            settings["detail"] = 1
        elif flag == "-r":
            try:
                settings["row"] = int(value)
            except ValueError:
                raise getopt.GetoptError(
                    "Row count must be a whole number: [%s]" % value, "r")
            if settings["row"] < 0:
                raise getopt.GetoptError(
                    "Row count must not be negative: [%s]" % value, "r")
        elif flag == "-t":
            test_only = True
        elif flag == "-?":
            sys.exit(help(0))
    if test_only:
        # Testing only disables Train Always and clearing the corpus.  This
        # is applied after the loop so that it holds whatever order the
        # options were given in.
        settings["train_method"] = "test"
        settings["clear_corpus"] = 0
    if len(paths) > 1:
        raise getopt.GetoptError(
            "Too many parameters: [%s]" % " ".join(paths), None)
    if not paths:
        raise getopt.GetoptError("Archive directory must be given", None)
    settings["path"] = paths[0]
    return settings


def _list_names(dirname, pattern):
    """Return the entries of dirname matching pattern, skipping dot-files.

    An unreadable or missing directory yields an empty list.
    """
    try:
        names = os.listdir(dirname or os.curdir)
    except OSError:
        return []
    if not pattern.startswith("."):
        names = [name for name in names if not name.startswith(".")]
    return fnmatch.filter(names, pattern)


def _bucket_folders(path):
    """Return the sorted sub-folder names of the archive directory.

    Plain files lying in the archive directory are ignored so that they are
    never mistaken for buckets.
    """
    return sorted(name for name in _list_names(path, "*")
                  if os.path.isdir(os.path.join(path, name)))


def _open_error_log():
    """Open errors.txt for writing; return None when that is not possible."""
    try:
        return open("errors.txt", "w")
    except (IOError, OSError):
        _out("Error: can not write errors.txt, continuing anyway.")
        return None


def _collect_messages(pf, key, path, train_method, archive, maillist):
    """Fill archive with bucket folders and maillist with "folder/file" names.

    Buckets missing from the corpus are created unless testing only.  Both
    lists are filled in place so the caller keeps partial results if the
    user interrupts the program.
    """
    archive.extend(_bucket_folders(path))
    if not archive:
        sys.exit("Error: No bucket folders found in " + path)
    known = pf.get_buckets(key)
    for folder in archive:
        if folder not in known:
            if train_method != "test":
                pf.create_bucket(key, folder)
            else:
                _out("Warning: Archive bucket", folder, "was not found in corpus.")
                _out(" Testing will not create it, this may cause misclassification.")
        for mail in _list_names(path + "/" + folder, "popfile*.msg"):
            if _MESSAGE_NAME.search(mail) is None:
                # Cannot be placed in day/number order, so leave it out
                # rather than fail while sorting.
                _out("Warning: Skipping", folder + "/" + mail,
                     "(unexpected file name)")
                continue
            maillist.append(folder + "/" + mail)
    maillist.sort(key=_message_sort_key)  # oldest message first
    _out(len(maillist), "messages found in", len(archive), "buckets")


def _classify_messages(pf, key, path, maillist, train_method, row,
                       errorfile, totals):
    """Classify every message, train as configured and show progress.

    One mark is written per message: "+" correct, "-" misclassified,
    "?" unclassified.  The counters in totals are updated in place.
    """
    row_size = _MARKS_PER_ROW * row
    rowerrors = 0
    count = 0
    for mail in maillist:
        bucket = mail.split("/")[0]
        target = path + "/" + mail
        result = pf.classify(key, target)
        if result != bucket:
            if train_method != "test":
                pf.add_message_to_bucket(key, bucket, target)
            if result == "unclassified":
                totals["unclassified"] += 1
                sys.stdout.write("?")
            else:
                totals["errors"] += 1
                rowerrors += 1
                sys.stdout.write("-")
            totals["reclassified"] += 1
            if errorfile is not None:
                errorfile.write(mail + "\t" + result + "\n")
        else:
            totals["correct"] += 1
            sys.stdout.write("+")
            if train_method == "ta":
                pf.add_message_to_bucket(key, bucket, target)
                totals["reclassified"] += 1
        count += 1
        if row:
            if count % row_size == 0:
                # End of a statistics row: accuracy over the row, number of
                # messages trained (or missed) so far, overall accuracy.
                good = totals["correct"] + totals["unclassified"]
                if train_method == "test":
                    label = "missed"
                else:
                    label = "trained"
                _out(" %.0f" % (float(row_size - rowerrors) / row_size * 100) + "%",
                     label,
                     totals["reclassified"],
                     "(%.1f" % (float(good) / (good + totals["errors"]) * 100) + "%)")
                rowerrors = 0
                sys.stdout.flush()
                if errorfile is not None:
                    errorfile.flush()
            elif count % _MARKS_PER_ROW == 0:
                _out()
        elif count % 79 == 0:
            # Row statistics are off: just wrap the marks at 79 columns.
            _out()
            sys.stdout.flush()
            if errorfile is not None:
                errorfile.flush()


def _print_summary(train_method, totals, seconds):
    """Print the training mode, the message counts and the overall accuracy."""
    good = totals["correct"] + totals["unclassified"]
    processed = good + totals["errors"]
    _out()
    if train_method == "toe":
        _out("Train Only Errors\n")
    elif train_method == "ta":
        _out("Train Always\n")
    else:
        _out("Testing Only (no training)\n")
    _out("Processed", processed, "messages in", round(seconds, 2), "seconds.")
    _out("Misclassified:\t", totals["errors"])
    _out("Unclassified:\t", totals["unclassified"])
    _out("Correct:\t", totals["correct"])
    _out()
    if processed > 0:
        _out("%.2f" % (float(good) / processed * 100) + "% accuracy")


def _print_bucket_details(pf, key, path, archive, maillist):
    """Print unique/total word counts and message counts for each bucket."""
    _out()
    _out("Bucket word counts:")
    width = 0
    for bucket in archive:
        width = max(width, len(bucket[0:40]))
    name_format = " %" + str(width) + "s"
    _out(name_format % "", "%10s %10s %10s" % ("Unique", "Total", "Messages"))
    magnets = pf.get_buckets_with_magnets(key)
    for bucket in archive:
        fields = [
            name_format % bucket[0:40],
            "%10.1d" % pf.get_bucket_unique_count(key, bucket),
            "%10.1d" % pf.get_bucket_word_count(key, bucket),
            "%10.1d" % len(_list_names(path + "/" + bucket, "popfile*.msg")),
        ]
        if bucket in magnets:
            fields.append(" magnet")
        _out(*fields)
    _out(name_format % "",
         "%10s" % "--------", "%10s" % "--------", "%10s" % "--------")
    if archive:
        # NOTE(review): the original passed the last bucket name as a second
        # argument to both totals calls; kept as-is. Confirm against the
        # POPFile API whether that argument is needed at all.
        last = archive[-1]
        _out(name_format % "",
             "%10d" % pf.get_unique_word_count(key, last),
             "%10d" % pf.get_word_count(key, last),
             "%10d" % len(maillist))


def _run(pf, key, settings):
    """Clear/collect/classify according to settings, then print the report."""
    path = settings["path"]
    train_method = settings["train_method"]

    # Everything the report needs is initialised up front so that Ctrl-C at
    # any point still leads to a valid (possibly empty) summary.
    archive = []
    maillist = []
    totals = {"errors": 0, "unclassified": 0, "correct": 0, "reclassified": 0}
    started = time.time()
    errorfile = _open_error_log()
    try:
        try:
            if settings["clear_corpus"]:
                # Clear the existing corpus, retaining buckets and magnets.
                _out("Clearing existing corpus")
                for each in pf.get_buckets(key):
                    pf.clear_bucket(key, each)
            _collect_messages(pf, key, path, train_method, archive, maillist)
            # Wall-clock time: the work happens inside POPFile, so this
            # process's own CPU time would say little.
            started = time.time()
            _classify_messages(pf, key, path, maillist, train_method,
                               settings["row"], errorfile, totals)
        except KeyboardInterrupt:
            _out("\n*Exited Early by user*")
        seconds = time.time() - started
        _print_summary(train_method, totals, seconds)
        if settings["detail"]:
            _print_bucket_details(pf, key, path, archive, maillist)
    finally:
        if errorfile is not None:
            errorfile.close()


def main(argv=None):
    """Program entry point.

    argv -- command line arguments without the program name; defaults to
            ``sys.argv[1:]``.

    Exits with the usage text when no arguments are given, with status 1 on
    invalid arguments, and with the connection help text when POPFile cannot
    be reached.
    """
    if argv is None:
        argv = sys.argv[1:]
    if not argv:
        sys.exit(help(0))
    try:
        settings = parse_options(argv)
    except getopt.GetoptError as detail:
        _out("Error:", detail)
        sys.exit(1)

    try:
        popfile = ServerProxy("http://localhost:8081")  # connect to POPFile
        pf = popfile.Classifier.Bayes                   # just a naming shortcut
        # Send username (admin in PF v0.21) and password to get a session key.
        key = pf.get_session_key("admin", "")
    except Exception:
        sys.exit(help(1))

    try:
        _run(pf, key, settings)
    finally:
        # Always hand the session key back, even after an error or sys.exit.
        try:
            pf.release_session_key(key)
        except Exception:
            _out("Warning: could not release the POPFile session key.")


if __name__ == "__main__":
    main()