| 1 | #!/usr/bin/python |
|---|
| 2 | """ |
|---|
| 3 | Python wrapper for the CRM114 Classifier (http://crm114.sourceforge.net/). |
|---|
| 4 | |
|---|
| 5 | Requires the crm command to be installed in your command path or be specified in the cfg file. |
|---|
| 6 | |
|---|
| 7 | Uses an ini style config file. |
|---|
| 8 | |
|---|
| 9 | To use the module, create an instance of the Classifier class, giving it a path to the config file. |
|---|
| 10 | Alternatively a space delimited list of categories can be passed in and |
|---|
| 11 | a crm.cfg file will be loaded from or created in the current dir. |
|---|
| 12 | |
|---|
| 13 | e.g: |
|---|
| 14 | c = Classifier("/path/to/mycrm.cfg") #to load a config file |
|---|
| 15 | c = Classifier("good bad ugly") #to create a config in the current dir with defaults |
|---|
| 16 | |
|---|
| 17 | To teach the classifier object about some text, call the learn method passing in a category |
|---|
| 18 | (one of the ones that you provided originally OR a new category), |
|---|
| 19 | and the text. |
|---|
| 20 | |
|---|
| 21 | e.g: |
|---|
| 22 | c.learn("good", "some good text") |
|---|
| 23 | c.learn("bad", "some bad text") |
|---|
| 24 | c.learn("ugly","SoMee Uggly tExT") |
|---|
| 25 | |
|---|
| 26 | To find out what the classifier thinks about some text, call the classify method passing in the text. |
|---|
| 27 | The result of this method is a tuple - |
|---|
| 28 | 1. the category best matching the text, |
|---|
| 29 | 2. the probability of the match |
|---|
| 30 | 3. the pR (see crm114 docs). |
|---|
| 31 | |
|---|
| 32 | e.g: |
|---|
| 33 | (classification, probability, pR) = c.classify("some text") |
|---|
| 34 | |
|---|
| 35 | """ |
|---|
| 36 | |
|---|
| 37 | __version__ = "1.1.0dev" |
|---|
| 38 | |
|---|
| 39 | __license__ = """ |
|---|
| 40 | Copyright (C) 2005 Sam Deane, 2007 Sam Deane, Phil Cooper. |
|---|
| 41 | MIT LICENSE http://www.opensource.org/licenses/mit-license.php |
|---|
| 42 | """ |
|---|
| 43 | |
|---|
import logging
import os
import re
import shlex
import subprocess

try:
    from ConfigParser import ConfigParser  # Python 2 module name
except ImportError:
    from configparser import ConfigParser  # same class, Python 3 module name
| 48 | |
|---|
# Layout of the lines written by the file handler that Classifier attaches
# when the config file has a ``logfile`` option.
# Alternative layout that also records the emitting file and line number:
#logFormat = logging.Formatter('%(asctime)s %(levelname)-8s %(filename)s#%(lineno)s %(message)s')
logFormat = logging.Formatter('%(asctime)s %(levelname)-8s %(message)s')

# Logger shared by every Classifier instance in the process.
log = logging.getLogger('crm.Classifier')
# A library logger should always own a do-nothing handler: without it,
# logging from a setup that has no ``logfile`` configured makes Python 2
# print "No handlers could be found for logger ...".
log.addHandler(logging.NullHandler())

# Contents of the crm.cfg file that Classifier creates in the current
# directory when it is given a list of classes and no crm.cfg exists yet.
# ``%(here)s`` is left for ConfigParser to interpolate.
crmDEFAULTS = """[crm]
# command path where the crm executable is found
cmdpath = crm

# directory where all classification(css) files are
# %(here)s is replaced with the directory of this file
#dir = %(here)s/data
dir = %(here)s

# classifier to use if this changes the css files need to be recreated
classifier = osb unique microgroom
extension = .css

# space delimited list of possible classes
#classes = spam ham

logfile = %(here)s/learning.log
"""

# Shell command templates. The four %s fields are, in order: the crm command,
# the data directory (given to ``-u``), the classifier flags, and the
# statistics file name(s).
crmLearnCommand = "%s -u %s '-{ learn <%s> ( %s ) }'"
crmClassifyCommand = "%s -u %s '-{ isolate (:stats:); classify <%s> ( %s ) (:stats:);output /:*:stats:/}'"
| 74 | |
|---|
| 75 | |
|---|
# wrapper for crm114
class Classifier:
    """Train and query CRM114 statistics files through the ``crm`` command.

    Each category is backed by one statistics file named
    ``<category><extension>``; the data directory is handed to ``crm`` with
    ``-u`` and the text is always sent on standard input.

    Attributes set by ``__init__``:
        categories -- category names used when classify() gets no choices
        path       -- data directory (``dir`` option, ``~`` expanded)
        CmdPath    -- crm command (``cmdpath`` option); may carry extra arguments
        Classifier -- classifier flags (``classifier`` option)
        Extension  -- statistics file suffix (``extension`` option)
    """

    # CRM114 programs handed to crm as ONE argument. These are the programs
    # embedded in the module-level shell templates, minus the shell quoting,
    # because crm is started without a shell.
    _learnProgram = "-{ learn <%s> ( %s ) }"
    _classifyProgram = "-{ isolate (:stats:); classify <%s> ( %s ) (:stats:);output /:*:stats:/}"

    def __init__(self, file_or_classes):
        """Load the configuration and make sure the statistics files exist.

        file_or_classes is split on whitespace:
          * two or more words are the categories; ``crm.cfg`` in the current
            directory is used and created from crmDEFAULTS if missing;
          * a single word is the path of the config file, which must then
            provide the ``classes`` option. (A config path containing
            whitespace therefore cannot be given.)

        Raises ValueError when the argument contains no word at all, and
        RuntimeError when the crm command cannot be started.
        """
        classes = file_or_classes.split()
        if not classes:
            # Previously this fell through to an unbound cfgFile (NameError).
            raise ValueError("expected a config file path or a space "
                             "delimited list of categories, got %r"
                             % (file_or_classes,))
        if len(classes) > 1:
            # if there is no config file, find it or make it
            cfgFile = 'crm.cfg'
            if not os.path.exists(cfgFile):
                with open(cfgFile, 'w') as newConfig:
                    newConfig.write(crmDEFAULTS)
        else:
            # a single word is treated as the config file name
            cfgFile = classes[0]
            classes = []

        # 'here' lets the config refer to its own directory via %(here)s
        config = ConfigParser({'here': os.path.dirname(os.path.abspath(cfgFile))})
        config.read(cfgFile)
        self.categories = classes or config.get('crm', 'classes').split()
        self.path = os.path.expanduser(config.get('crm', 'dir'))
        self.CmdPath = config.get('crm', 'cmdpath')
        self.Classifier = config.get('crm', 'classifier')
        self.Extension = config.get('crm', 'extension')

        if config.has_option('crm', 'logfile'):
            logfile = os.path.abspath(
                os.path.expanduser(config.get('crm', 'logfile')))
            # The logger is shared by all instances: attach one handler per
            # log file, otherwise every extra instance duplicates each line.
            attached = any(getattr(handler, 'baseFilename', None) == logfile
                           for handler in log.handlers)
            if not attached:
                loghandler = logging.FileHandler(logfile)
                loghandler.setFormatter(logFormat)
                loghandler.setLevel(logging.INFO)
                log.addHandler(loghandler)
            log.setLevel(logging.INFO)
        self.makeFiles()

    def _crmArgs(self, program):
        """Return the argument list that runs `program` with crm.

        The configured command is split like a shell would split it, and
        ``~`` / ``$VAR`` are expanded, so values that worked when the command
        went through a shell keep working. The data directory and the
        program are single arguments, so spaces or quotes in them are safe.
        """
        command = [os.path.expanduser(os.path.expandvars(part))
                   for part in shlex.split(self.CmdPath)]
        return command + ['-u', self.path, program]

    def _runCrm(self, program, text):
        """Run crm on `program`, feed it `text` on stdin, return its stdout.

        Raises RuntimeError when the crm command cannot be started.
        A non-zero exit status is only logged.
        """
        args = self._crmArgs(program)
        log.debug("Running: %s", args)
        try:
            process = subprocess.Popen(args,
                                       stdin=subprocess.PIPE,
                                       stdout=subprocess.PIPE,
                                       universal_newlines=True)
        except OSError as err:
            raise RuntimeError("could not run crm command %r: %s"
                               % (self.CmdPath, err))
        # communicate() writes and reads concurrently, so large inputs or
        # outputs cannot dead-lock the two pipes.
        output, _ = process.communicate(text)
        if process.returncode:
            log.warning("crm exited with status %s", process.returncode)
        return output

    def _parseStats(self, stats):
        """Extract (category, probability, pR) from crm's classify output.

        Raises RuntimeError carrying the raw output when it contains no
        "Best match to file" line for a file with the configured extension.
        """
        # The extension is escaped so that its "." is literal, and the file
        # index ("#0", "#12", ...) may have any number of digits.
        pattern = (r"Best match to file \S+ \(.*?([a-zA-Z0-9_-]+)%s\)"
                   r" +prob: *([0-9.]+) +pR: *([0-9.-]+)"
                   % re.escape(self.Extension))
        found = re.search(pattern, stats)
        if not found:
            raise RuntimeError(stats)
        cat, prob, pR = found.groups()
        return (cat, float(prob), float(pR))

    # teach the classifier what category some new text is in
    def learn(self, category, text):
        """Train the statistics file of `category` with `text`.

        `category` may be new; its file is named category + extension.
        Empty text is what makeFiles() sends to create the files and is
        not written to the learning log.
        Raises RuntimeError when the crm command cannot be started.
        """
        program = self._learnProgram % (self.Classifier,
                                        category + self.Extension)
        self._runCrm(program, text)
        if text:
            log.info('Learn: %s <%s>', category, text)

    # ask the classifier what category best matches some text
    def classify(self, text, choices=''):
        """Return the best matching category for `text`.

        choices -- optional space delimited list of categories to choose
                   from; defaults to self.categories.

        Returns a (category, probability, pR) tuple.
        Raises RuntimeError (carrying crm's output) when the result cannot
        be parsed, or when the crm command cannot be started.
        """
        choices = choices.split() or self.categories
        files = [cat + self.Extension for cat in choices]
        program = self._classifyProgram % (self.Classifier, ' '.join(files))
        return self._parseStats(self._runCrm(program, text))

    # ensure that data files exist, by calling learn with an empty string
    def makeFiles(self):
        """Create the data directory and one statistics file per category."""
        # makedirs also creates missing parent directories
        if not os.path.isdir(self.path):
            os.makedirs(self.path)

        # learning an empty text runs crm once per category file
        for category in self.categories:
            self.learn(category, "")