root / crm / trunk / crm.py

Revision 52, 4.9 kB (checked in by phil, 16 months ago)

Changed to use an ini style config file for setup:

Line 
1#!/usr/bin/python
2"""
3Python wrapper for the CRM114 Classifier (http://crm114.sourceforge.net/).
4
5Requires the crm command to be installed in your command path or be specified in the cfg file.
6
7Uses an ini style config file.
8
9To use the module, create an instance of the Classifier class, giving it a path to the config file.
10Alternatively a space delimited list of categories can be passed in and
11then a crm.cfg file will be loaded from or created in the current dir.
12
13e.g:
14        c = Classifier("/path/to/mycrm.cfg") #to load a config file
15        c = Classifier("good bad ugly")      #to create a config in the current dir with defaults
16
17To teach the classifier object about some text, call the learn method passing in a category
18(one of the ones that you provided originally OR a new category),
19and the text.
20
21e.g:
22        c.learn("good", "some good text")
23        c.learn("bad", "some bad text")
24        c.learn("ugly","SoMee Uggly tExT")
25       
26To find out what the classifier thinks about some text, call the classify method passing in the text.
27The result of this method is a tuple -
28  1. the category best matching the text,
29  2. the probability of the match
30  3. the pR (see crm114 docs).
31
32e.g:
33        (classification, probability, pR) = c.classify("some text")
34
35"""
36
# Release identifier of this wrapper module.
__version__ = "1.1.0dev"

# Copyright holders and licence of this module; the text starts and ends
# with a newline.
__license__ = (
    "\n"
    "Copyright (C) 2005 Sam Deane, 2007 Sam Deane, Phil Cooper.\n"
    "MIT LICENSE http://www.opensource.org/licenses/mit-license.php\n"
)
43
import logging
import os
import re
import shlex
import subprocess
from ConfigParser import ConfigParser
48
# Logger shared by everything in this module.
log = logging.getLogger('crm.Classifier')

# Layout of each log record: timestamp, level name padded to eight
# characters, two spaces, then the message.  A more detailed variant would
# add '%(filename)s#%(lineno)s' in front of the message.
logFormat = logging.Formatter(
    '%(asctime)s %(levelname)-8s  %(message)s'
)

# Default ini style configuration.  The '%(here)s' placeholders are left in
# place on purpose: they are ConfigParser interpolation references, not
# Python string formatting.
crmDEFAULTS = (
    "[crm]\n"
    "# command path where the crm executable is found\n"
    "cmdpath = crm\n"
    "\n"
    "# directory where all classification(css) files are\n"
    "# %(here)s is replaced with the directory of this file\n"
    "#dir = %(here)s/data\n"
    "dir = %(here)s\n"
    "\n"
    "# classifier to use if this changes the css files need to be recreated\n"
    "classifier = osb unique microgroom\n"
    "extension = .css\n"
    "\n"
    "# space delimited list of possible classes\n"
    "#classes = spam ham\n"
    "\n"
    "logfile = %(here)s/learning.log\n"
)

# Shell command templates.  Both take, in order: the crm command, the
# directory given to -u, the classifier flags and the category file name(s).
crmLearnCommand = (
    "%s -u %s "
    "'-{ learn <%s> ( %s ) }'"
)
crmClassifyCommand = (
    "%s -u %s "
    "'-{ isolate (:stats:); classify <%s> ( %s ) (:stats:);"
    "output /:*:stats:/}'"
)
74       
75
76# wrapper for crm114
class Classifier:
    """Wrapper that drives the CRM114 ``crm`` command line tool.

    Create an instance from either the path of an ini style config file or
    a space delimited list of categories, then train it with learn() and
    query it with classify().

    Attributes set by the constructor:
        categories  list of category names
        path        directory passed to ``crm -u`` (holds the data files)
        CmdPath     command used to start crm (``cmdpath`` option)
        Classifier  CRM114 classifier flags (``classifier`` option)
        Extension   extension of the category files (``extension`` option)
    """

    # CRM114 programs, handed to crm as ONE argument.  They are the same
    # programs the shell command templates embed, minus the shell quoting,
    # because the commands are now started without a shell.
    _learnProgram = "-{ learn <%s> ( %s ) }"
    _classifyProgram = ("-{ isolate (:stats:); classify <%s> ( %s ) (:stats:);"
                        "output /:*:stats:/}")

    def __init__(self, file_or_classes):
        """Load the configuration and make sure the category files exist.

        file_or_classes is either
          * the path of a config file (a single word, or any path that
            names an existing file), or
          * two or more space delimited category names; ``crm.cfg`` in the
            current directory is then used, and created with the default
            settings if it does not exist yet.

        Raises ValueError when file_or_classes is empty or only whitespace.
        ConfigParser errors propagate when the config file lacks the
        ``[crm]`` section or one of its required options.
        """
        words = file_or_classes.split()
        if not words:
            # Previously this fell through and crashed with a NameError.
            raise ValueError("Classifier needs a config file path or a "
                             "space delimited list of categories")

        if len(words) == 1:
            cfgFile, classes = words[0], []
        elif os.path.isfile(file_or_classes):
            # A config path containing spaces must not be mistaken for a
            # list of categories.
            cfgFile, classes = file_or_classes, []
        else:
            cfgFile, classes = 'crm.cfg', words
            if not os.path.exists(cfgFile):
                with open(cfgFile, 'w') as handle:
                    handle.write(crmDEFAULTS)

        # 'here' lets the config refer to its own directory via %(here)s.
        config = ConfigParser({'here': os.path.dirname(os.path.abspath(cfgFile))})
        config.read(cfgFile)
        self.categories = classes or config.get('crm', 'classes').split()
        self.path = os.path.expanduser(config.get('crm', 'dir'))
        self.CmdPath = config.get('crm', 'cmdpath')
        self.Classifier = config.get('crm', 'classifier')
        self.Extension = config.get('crm', 'extension')
        if config.has_option('crm', 'logfile'):
            self._attachLogFile(os.path.expanduser(config.get('crm', 'logfile')))
        self.makeFiles()

    def _attachLogFile(self, logfile):
        """Send INFO records of the module logger to logfile, once per file."""
        log.setLevel(logging.INFO)
        target = os.path.abspath(logfile)
        for existing in log.handlers:
            # The logger is shared by all instances; adding a second handler
            # for the same file would write every record twice.
            if getattr(existing, 'baseFilename', None) == target:
                return
        handler = logging.FileHandler(logfile)
        handler.setFormatter(logFormat)
        handler.setLevel(logging.INFO)
        log.addHandler(handler)

    def _commandLine(self, program):
        """Return the argument list that runs the CRM114 program text."""
        # shlex.split keeps a ``cmdpath`` that carries extra arguments working.
        return shlex.split(self.CmdPath) + ['-u', self.path, program]

    def _run(self, argv, text):
        """Run argv with text on stdin; return (exit status, stdout text).

        No shell is involved, so category names and directories are never
        interpreted by one.  Raises OSError if the command cannot be started.
        """
        process = subprocess.Popen(argv,
                                   stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE,
                                   universal_newlines=True)
        output = process.communicate(text)[0]
        return process.returncode, output

    def _parseStats(self, stats):
        """Extract (category, probability, pR) from crm's statistics text.

        Raises RuntimeError carrying the raw text when no "Best match" line
        for a file with this classifier's extension is found.
        """
        # The extension is escaped so that e.g. the '.' of '.css' only
        # matches a literal dot.
        pattern = (r"Best match to file .. \(.*?([a-zA-Z0-9_-]+)%s\)"
                   r" +prob: *([0-9.]+) +pR: *([0-9.-]+)") % re.escape(self.Extension)
        found = re.search(pattern, stats)
        if not found:
            raise RuntimeError(stats)
        cat, prob, pR = found.groups()
        return (cat, float(prob), float(pR))

    def learn(self, category, text):
        """Teach the classifier that text belongs to category.

        Stays best effort, as before: a crm command that cannot be started
        is logged as an error, and a non-zero exit status as a warning;
        neither raises.  Non-empty text is recorded in the log at INFO level.
        """
        program = self._learnProgram % (self.Classifier, category + self.Extension)
        argv = self._commandLine(program)
        log.debug("Learn: %s", ' '.join(argv))
        try:
            status, _ = self._run(argv, text)
        except OSError as error:
            log.error("Learn: could not run %s: %s", argv[0], error)
            return
        if status:
            log.warning("Learn: %s exited with status %s", argv[0], status)
        if text:
            log.info('Learn: %s <%s>', category, text)

    def classify(self, text, choices=''):
        """Return the best matching category for text.

        choices is an optional space delimited subset of categories to
        consider; by default all known categories are used.

        Returns a (category, probability, pR) tuple.
        Raises RuntimeError when crm cannot be started or its output holds
        no recognisable result (the raw output is the exception argument).
        """
        chosen = choices.split() or self.categories
        files = [name + self.Extension for name in chosen]
        program = self._classifyProgram % (self.Classifier, ' '.join(files))
        argv = self._commandLine(program)
        log.debug("Classify: %s", ' '.join(argv))
        try:
            stats = self._run(argv, text)[1]
        except OSError as error:
            raise RuntimeError("could not run %s: %s" % (argv[0], error))
        return self._parseStats(stats)

    def makeFiles(self):
        """Create the data directory and one data file per category."""
        if not os.path.exists(self.path):
            # makedirs also creates missing parent directories.
            os.makedirs(self.path)

        # Learning an empty string makes crm create the category file.
        for category in self.categories:
            self.learn(category, "")
Note: See TracBrowser for help on using the browser.