9 """
10 Classes and interfaces for tagging each token of a document with
11 supplementary information, such as its part of speech or its WordNet
12 synset tag. This task, which is known as X{tagging}, is defined by
13 the L{TagI} interface.
14 """

from nltk_lite.probability import FreqDist, ConditionalFreqDist
from nltk_lite.tag import *
import re

26 """
27 A unigram stochastic tagger. Before C{tag.Unigram} can be
28 used, it should be trained on a tagged corpus. Using this
29 training data, it will find the most likely tag for each word
30 type. It will then use this information to assign the most
31 frequent tag to each word. If C{tag.Unigram} encounters a
32 word which it has no data, it will assign it the
33 tag C{None}.
34 """
35 yaml_tag = '!tag.Unigram'
    def __init__(self, cutoff=1, backoff=None):
        """
        Construct a new unigram stochastic tagger. The new tagger
        should be trained, using the L{train()} method, before it is
        used to tag data.
        """
        self._model = {}
        self._cutoff = cutoff
        self._backoff = backoff
        self._history = None

    def train(self, tagged_corpus, verbose=False):
        """
        Train C{tag.Unigram} using the given training data.

        @param tagged_corpus: A tagged corpus. Each item should be
            a C{list} of tagged tokens, where each tagged token is a
            C{(text, tag)} pair.
        @type tagged_corpus: C{list} or C{iter(list)}
        """
        if self.size() != 0:
            raise ValueError('Tagger is already trained')
        token_count = hit_count = 0
        fd = ConditionalFreqDist()

        # Allow a single tagged sentence to be passed in directly.
        if isinstance(tagged_corpus, list) and isinstance(tagged_corpus[0], tuple):
            tagged_corpus = [tagged_corpus]

        for sentence in tagged_corpus:
            for (token, tag) in sentence:
                token_count += 1
                fd[token].inc(tag)

        # Keep only entries that disagree with the backoff tagger and
        # occur more often than the cutoff.
        for token in fd.conditions():
            best_tag = fd[token].max()
            backoff_tag = self._backoff_tag_one(token)
            hits = fd[token].count(best_tag)
            if best_tag != backoff_tag and hits > self._cutoff:
                self._model[token] = best_tag
                hit_count += hits

        if verbose:
            size = len(self._model)
            backoff = 100 - (hit_count * 100.0) / token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print "[Trained Unigram tagger:",
            print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                size, backoff, pruning)

    def tag_one(self, token, history=None):
        if self.size() == 0:
            raise ValueError('Tagger is not trained')
        if token in self._model:
            return self._model[token]
        if self._backoff:
            return self._backoff.tag_one(token, history)
        return None

    def size(self):
        return len(self._model)

    def __repr__(self):
        return '<Unigram Tagger: size=%d, cutoff=%d>' % (
            self.size(), self._cutoff)
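
# A minimal usage sketch (illustrative; the corpus here is hand-built, and
# tag.Default is the default tagger used in demo() below). A tagged corpus
# is a list of sentences, each a list of (token, tag) pairs. cutoff=0 keeps
# tags seen only once.
#
#     from nltk_lite import tag
#     corpus = [[('the', 'at'), ('dog', 'nn'), ('barked', 'vbd')]]
#     uni = tag.Unigram(cutoff=0, backoff=tag.Default('nn'))
#     uni.train(corpus)
#     uni.tag_one('the')     # 'at'  (learned from the corpus)
#     uni.tag_one('cat')     # 'nn'  (unseen word; handled by the backoff)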
104
105
106
class Affix(SequentialBackoff):
    """
    A unigram tagger that assigns tags to tokens based on leading or
    trailing substrings (it is important to note that the substrings
    are not necessarily "true" morphological affixes). Before
    C{tag.Affix} can be used, it should be trained on a tagged
    corpus. Using this training data, it will find the most likely
    tag for each affix. It will then use this information to assign
    the most frequent tag to each word. If C{tag.Affix} encounters a
    prefix or suffix in a word for which it has no data, it will
    assign the tag C{None}.
    """
    def __init__(self, length, minlength, cutoff=1, backoff=None):
        """
        Construct a new affix stochastic tagger. The new tagger should be
        trained, using the L{train()} method, before it is used to tag
        data.

        @type length: C{number}
        @param length: The length of the affix to be considered during
            training and tagging (negative for suffixes)
        @type minlength: C{number}
        @param minlength: The minimum length for a word to be considered
            during training and tagging. It must be longer than C{length}.
        """
        self._model = {}

        assert minlength > 0

        self._length = length
        self._minlength = minlength
        self._cutoff = cutoff
        self._backoff = backoff
        self._history = None

    def _get_affix(self, token):
        if self._length > 0:
            return token[:self._length]
        else:
            return token[self._length:]

    def train(self, tagged_corpus, verbose=False):
        """
        Train C{tag.Affix} using the given training data.

        @param tagged_corpus: A tagged corpus. Each item should be
            a C{list} of tagged tokens, where each tagged token is a
            C{(text, tag)} pair.
        @type tagged_corpus: C{list} or C{iter(list)}
        """
        if self.size() != 0:
            raise ValueError('Tagger is already trained')
        token_count = hit_count = 0
        fd = ConditionalFreqDist()

        for sentence in tagged_corpus:
            for (token, tag) in sentence:
                token_count += 1
                # Only consider words long enough to carry the affix.
                if len(token) >= self._minlength:
                    backoff_tag = self._backoff_tag_one(token)
                    if tag != backoff_tag:
                        # Train on the affix only where the backoff
                        # tagger would get the tag wrong.
                        affix = self._get_affix(token)
                        hit_count += 1
                        fd[affix].inc(tag)

        for affix in fd.conditions():
            best_tag = fd[affix].max()
            if fd[affix].count(best_tag) > self._cutoff:
                self._model[affix] = best_tag

        if verbose:
            size = len(self._model)
            backoff = 100 - (hit_count * 100.0) / token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print "[Trained Affix tagger:",
            print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                size, backoff, pruning)

    def tag_one(self, token, history=None):
        if self.size() == 0:
            raise ValueError('Tagger is not trained')
        affix = self._get_affix(token)
        if len(token) >= self._minlength and affix in self._model:
            return self._model[affix]
        if self._backoff:
            return self._backoff.tag_one(token, history)
        return None

    def size(self):
        return len(self._model)

    def __repr__(self):
        return '<Affix Tagger: size=%d, cutoff=%d>' % (
            self.size(), self._cutoff)
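
# A minimal usage sketch (mirroring the construction in demo() below): a
# suffix tagger over 3-character word endings, applied to words of at least
# 5 characters, with a default-tag fallback.
#
#     from nltk_lite.corpora import brown
#     from nltk_lite import tag
#     aff = tag.Affix(-3, 5, cutoff=2, backoff=tag.Default('nn'))
#     aff.train(brown.tagged('a'))
#     aff.tag_one('crawling')    # the tag most often learned for '-ing'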
206
207
class Regexp(SequentialBackoff):
    """
    A tagger that assigns tags to words based on regular expressions.
    """
    yaml_tag = '!tag.Regexp'
    def __init__(self, regexps, backoff=None):
        """
        Construct a new regexp tagger.

        @type regexps: C{list} of C{(string,string)}
        @param regexps: A list of C{(regexp, tag)} pairs, each of
            which indicates that a word matching C{regexp} should
            be tagged with C{tag}. The pairs will be evaluated in
            order. If none of the regexps match a word, then the
            optional backoff tagger is invoked if given; otherwise
            the word is assigned the tag C{None}.
        """
        self._regexps = regexps
        self._backoff = backoff
        self._history = None

    def tag_one(self, token, history=None):
        # Return the tag of the first regexp that matches the token,
        # falling back to the backoff tagger (if any) when none match.
        for regexp, tag in self._regexps:
            if re.match(regexp, token):
                return tag
        if self._backoff:
            return self._backoff.tag_one(token, history)
        return None

    def __repr__(self):
        return '<Regexp Tagger: size=%d>' % len(self._regexps)
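
# A minimal usage sketch (extending the single pattern used in demo() below):
# tag words by their endings, with a default-tag fallback.
#
#     from nltk_lite import tag
#     rex = tag.Regexp([(r'.*ed$', 'vbd'), (r'.*ing$', 'vbg')],
#                      backoff=tag.Default('nn'))
#     rex.tag_one('walked')    # 'vbd'
#     rex.tag_one('dog')       # 'nn' (no regexp matched; backoff applied)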
239
class Lookup(SequentialBackoff):
    """
    A tagger that assigns tags to words based on a lookup table.
    """
    def __init__(self, table, backoff=None):
        """
        Construct a new lookup tagger.

        @type table: C{dict} from C{string} to C{string}
        @param table: A dictionary mapping words to tags, which
            indicates that a particular C{word} should be assigned
            a given C{tag}. If a word is not in the table, then the
            optional backoff tagger is invoked if given; otherwise
            the word is assigned the tag C{None}.
        """
        self._table = table
        self._backoff = backoff
        self._history = None
258
    def tag_one(self, token, history=None):
        if token in self._table:
            return self._table[token]
        if self._backoff:
            return self._backoff.tag_one(token, history)
        return None

    def __repr__(self):
        return '<Lookup Tagger: size=%d>' % len(self._table)
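
# A minimal usage sketch (mirroring demo() below): a table of known words,
# with a default-tag fallback for everything else.
#
#     from nltk_lite import tag
#     look = tag.Lookup({'the': 'dt'}, backoff=tag.Default('nn'))
#     look.tag_one('the')    # 'dt'
#     look.tag_one('cat')    # 'nn' (not in the table; backoff applied)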

def _demo_tagger(tagger, gold):
    # Score a tagger against gold-standard tagged data. Assumes the
    # accuracy() helper provided by nltk_lite.tag.
    from nltk_lite.tag import accuracy
    acc = accuracy(tagger, gold)
    print 'Accuracy = %4.1f%%' % (100.0 * acc)

279 """
280 A simple demonstration function for the C{Tagger} classes. It
281 constructs a backoff tagger using a trigram tagger, bigram tagger
282 unigram tagger and a default tagger. It trains and tests the
283 tagger using the Brown corpus.
284 """
    from nltk_lite.corpora import brown
    from nltk_lite import tag
    import sys

    print 'Training taggers.'

    t0 = tag.Default('nn')

    t1 = tag.Unigram(cutoff=1, backoff=t0)
    t1.train(brown.tagged('a'), verbose=True)

    t2 = tag.Affix(-3, 5, cutoff=2, backoff=t0)
    t2.train(brown.tagged('a'), verbose=True)

    t3 = tag.Regexp([(r'.*ed', 'vbd')], backoff=t0)

    t4 = tag.Lookup({'the': 'dt'}, backoff=t0)

    print '=' * 75
    print 'Running the taggers on test data...'

    print '  Default (nn) tagger: ',
    sys.stdout.flush()
    _demo_tagger(t0, brown.tagged('b'))

    print '  Unigram tagger:      ',
    sys.stdout.flush()
    _demo_tagger(t1, list(brown.tagged('b'))[:1000])

    print '  Affix tagger:        ',
    sys.stdout.flush()
    _demo_tagger(t2, list(brown.tagged('b'))[:1000])

    print '  Regexp tagger:       ',
    sys.stdout.flush()
    _demo_tagger(t3, list(brown.tagged('b'))[:1000])

    print '  Lookup tagger:       ',
    sys.stdout.flush()
    _demo_tagger(t4, list(brown.tagged('b'))[:1000])


if __name__ == '__main__':
    demo()
331