"""
Brill's transformational rule-based tagger.
"""
13
import bisect
import random

import yaml

from nltk_lite import yamltags
from nltk_lite.tag import TagI
20
21
22
23
24
class Brill(yaml.YAMLObject):
    """
    Brill's transformational rule-based tagger.  Brill taggers use an
    X{initial tagger} (such as L{tag.Default}) to assign an initial
    tag sequence to a text; and then apply an ordered list of
    transformational rules to correct the tags of individual tokens.
    These transformation rules are specified by the L{BrillRuleI}
    interface.

    Brill taggers can be created directly, from an initial tagger and
    a list of transformational rules; but more often, Brill taggers
    are created by learning rules from a training corpus, using either
    L{BrillTrainer} or L{FastBrillTrainer}.
    """

    yaml_tag = '!tag.Brill'

    def __init__(self, initial_tagger, rules):
        """
        @param initial_tagger: The initial tagger
        @type initial_tagger: L{TagI}
        @param rules: An ordered list of transformation rules that
            should be used to correct the initial tagging.
        @type rules: C{list} of L{BrillRuleI}
        """
        self._initial_tagger = initial_tagger
        self._rules = rules

    def rules(self):
        # NOTE(review): the original source lines for this accessor were
        # lost in extraction; reconstructed from demo()'s call to
        # b.rules() -- confirm against the upstream file.
        return self._rules

    def tag(self, tokens):
        """
        Tag C{tokens}: run the initial tagger over the tokens, then
        apply each transformation rule in order, yielding the corrected
        (token, tag) pairs.
        """
        # Inherit the initial tagging.
        tagged_tokens = list(self._initial_tagger.tag(tokens))

        # Map each tag to the set of positions currently carrying it,
        # so each rule only needs to inspect candidate positions.
        tag_to_positions = {}
        for i, (token, tag) in enumerate(tagged_tokens):
            if tag not in tag_to_positions:
                tag_to_positions[tag] = set([i])
            else:
                tag_to_positions[tag].add(i)

        # Apply each rule, in order.  Only try to apply rules at
        # positions that are tagged with the rule's original tag.
        for rule in self._rules:
            # Find the positions where the rule might apply.
            positions = tag_to_positions.get(rule.original_tag(), [])
            changed = rule.apply_at(tagged_tokens, positions)
            # Update the position index for every tag the rule changed.
            for i in changed:
                tag_to_positions[rule.original_tag()].remove(i)
                if rule.replacement_tag() not in tag_to_positions:
                    tag_to_positions[rule.replacement_tag()] = set([i])
                else:
                    tag_to_positions[rule.replacement_tag()].add(i)
        for t in tagged_tokens:
            yield t
87
88
89
90
91
93 """
94 An interface for tag transformations on a tagged corpus, as
95 performed by brill taggers. Each transformation finds all tokens
96 in the corpus that are tagged with a specific X{original tag} and
97 satisfy a specific X{condition}, and replaces their tags with a
98 X{replacement tag}. For any given transformation, the original
99 tag, replacement tag, and condition are fixed. Conditions may
100 depend on the token under consideration, as well as any other
101 tokens in the corpus.
102
103 Brill rules must be comparable and hashable.
104 """
106 """
107 Apply this rule everywhere it applies in the corpus. I.e.,
108 for each token in the corpus that is tagged with this rule's
109 original tag, and that satisfies this rule's condition, set
110 its tag to be this rule's replacement tag.
111
112 @param tokens: The tagged corpus
113 @type tokens: C{list} of C{tuple}
114 @return: The indices of tokens whose tags were changed by this
115 rule.
116 @rtype: C{list} of C{int}
117 """
118 return self.apply_at(tokens, range(len(tokens)))
119
121 """
122 Apply this rule at every position in C{positions} where it
123 applies to the corpus. I.e., for each position M{p} in
124 C{positions}, if C{tokens[M{p}]} is tagged with this rule's
125 original tag, and satisfies this rule's condition, then set
126 its tag to be this rule's replacement tag.
127
128 @param tokens: The tagged corpus
129 @type tokens: list of Token
130 @type positions: C{list} of C{int}
131 @param positions: The positions where the transformation is to
132 be tried.
133 @return: The indices of tokens whose tags were changed by this
134 rule.
135 @rtype: C{int}
136 """
137 assert False, "BrillRuleI is an abstract interface"
138
140 """
141 @return: True if the rule would change the tag of
142 C{tokens[index]}, False otherwise
143 @rtype: Boolean
144
145 @param tokens: A tagged corpus
146 @type tokens: list of Token
147 @param index: The index to check
148 @type index: int
149 """
150 assert False, "BrillRuleI is an abstract interface"
151
153 """
154 @return: The tag which this C{BrillRuleI} may cause to be
155 replaced.
156 @rtype: any
157 """
158 assert False, "BrillRuleI is an abstract interface"
159
161 """
162 @return: the tag with which this C{BrillRuleI} may replace
163 another tag.
164 @rtype: any
165 """
166 assert False, "BrillRuleI is an abstract interface"
167
168
170 assert False, "Brill rules must be comparable"
172 assert False, "Brill rules must be hashable"
173
174
176 """
177 An abstract base class for brill rules whose condition checks for
178 the presence of tokens with given properties at given ranges of
179 positions, relative to the token.
180
181 Each subclass of proximate tokens brill rule defines a method
182 M{extract_property}, which extracts a specific property from the
183 the token, such as its text or tag. Each instance is
184 parameterized by a set of tuples, specifying ranges of positions
185 and property values to check for in those ranges:
186
187 - (M{start}, M{end}, M{value})
188
189 The brill rule is then applicable to the M{n}th token iff:
190
191 - The M{n}th token is tagged with the rule's original tag; and
192 - For each (M{start}, M{end}, M{value}) triple:
193 - The property value of at least one token between
194 M{n+start} and M{n+end} (inclusive) is M{value}.
195
196 For example, a proximate token brill template with M{start=end=-1}
197 generates rules that check just the property of the preceding
198 token. Note that multiple properties may be included in a single
199 rule; the rule applies if they all hold.
200 """
201
202 - def __init__(self, original_tag, replacement_tag, *conditions):
203 """
204
205 Construct a new brill rule that changes a token's tag from
206 C{original_tag} to C{replacement_tag} if all of the properties
207 specified in C{conditions} hold.
208
209 @type conditions: C{tuple} of C{(int, int, *)}
210 @param conditions: A list of 3-tuples C{(start, end, value)},
211 each of which specifies that the property of at least one
212 token between M{n}+C{start} and M{n}+C{end} (inclusive) is
213 C{value}.
214 @raise ValueError: If C{start}>C{end} for any condition.
215 """
216 assert self.__class__ != ProximateTokensRule, \
217 "ProximateTokensRule is an abstract base class"
218
219 self._original = original_tag
220 self._replacement = replacement_tag
221 self._conditions = conditions
222 for (s,e,v) in conditions:
223 if s>e:
224 raise ValueError('Condition %s has an invalid range' %
225 ((s,e,v),))
226
227
228 @classmethod
236 @classmethod
238 map = loader.construct_mapping(node, deep=True)
239 return cls(map['original'], map['replacement'],
240 *(tuple(x) for x in map['conditions']))
241
243 """
244 Returns some property characterizing this token, such as its
245 base lexical item or its tag.
246
247 Each implentation of this method should correspond to an
248 implementation of the method with the same name in a subclass
249 of L{ProximateTokensTemplate}.
250
251 @param token: The token
252 @type token: Token
253 @return: The property
254 @rtype: any
255 """
256 assert False, "ProximateTokensRule is an abstract interface"
257 extract_property = staticmethod(extract_property)
258
260
261
262
263 change = []
264 for i in positions:
265 if self.applies(tokens, i):
266 change.append(i)
267
268
269
270
271 for i in change:
272 (token, tag) = tokens[i]
273 tokens[i] = (token, self._replacement)
274
275 return change
276
278
279
280
281 if tokens[index][1] != self._original:
282 return False
283
284
285 for (start, end, val) in self._conditions:
286
287 s = max(0, index+start)
288 e = min(index+end+1, len(tokens))
289
290
291 for i in range(s, e):
292 if self.extract_property(tokens[i]) == val:
293 break
294 else:
295
296 return False
297
298
299 return True
300
302
303 return self._original
304
306
307 return self._replacement
308
310 return (other != None and
311 other.__class__ == self.__class__ and
312 self._original == other._original and
313 self._replacement == other._replacement and
314 self._conditions == other._conditions)
315
317 return hash( (self._original, self._replacement, self._conditions,
318 self.__class__.__name__) )
319
321 conditions = ' and '.join(['%s in %d...%d' % (v,s,e)
322 for (s,e,v) in self._conditions])
323 return '<%s: %s->%s if %s>' % (self.__class__.__name__,
324 self._original, self._replacement,
325 conditions)
326
328 replacement = '%s -> %s' % (self._original,
329 self._replacement)
330 if len(self._conditions) == 0:
331 conditions = ''
332 else:
333 conditions = ' if '+ ', and '.join([self._condition_to_str(c)
334 for c in self._conditions])
335 return replacement+conditions
336
345
347 """
348 Return a string representation for the given range. This
349 helper method is used by L{__str__}.
350 """
351 if start == end == 0:
352 return 'this word'
353 if start == end == -1:
354 return 'the preceding word'
355 elif start == end == 1:
356 return 'the following word'
357 elif start == end and start < 0:
358 return 'word i-%d' % -start
359 elif start == end and start > 0:
360 return 'word i+%d' % start
361 else:
362 if start >= 0: start = '+%d' % start
363 if end >= 0: end = '+%d' % end
364 return 'words i%s...i%s' % (start, end)
365
class ProximateTagsRule(ProximateTokensRule):
    """
    A rule which examines the tags of nearby tokens.
    @see: L{ProximateTokensRule} for details.
    @see: L{ProximateTagsTemplate}, which generates these rules.
    """
    # NOTE(review): everything but the trailing staticmethod assignment
    # of this class was lost in extraction; reconstructed by symmetry
    # with ProximateWordsRule below and from demo()'s use of
    # brill.ProximateTagsRule -- confirm against the upstream file.
    PROPERTY_NAME = 'tag'
    yaml_tag = '!ProximateTagsRule'

    def extract_property(token):
        """@return: The given token's tag."""
        return token[1]
    extract_property = staticmethod(extract_property)
380 """
381 A rule which examines the base types of nearby tokens.
382 @see: L{ProximateTokensRule} for details.
383 @see: L{ProximateWordsTemplate}, which generates these rules.
384 """
385 PROPERTY_NAME = 'text'
386 yaml_tag = '!ProximateWordsRule'
388 """@return: The given token's text."""
389 return token[0]
390 extract_property = staticmethod(extract_property)
391
392
393
394
395
397 """
398 An interface for generating lists of transformational rules that
399 apply at given corpus positions. C{BrillTemplateI} is used by
400 C{Brill} training algorithms to generate candidate rules.
401 """
404
406 """
407 Return a list of the transformational rules that would correct
408 the C{i}th subtoken's tag in the given token. In particular,
409 return a list of zero or more rules that would change
410 C{tagged_tokens[i][1]} to C{correctTag}, if applied
411 to C{token}.
412
413 If the C{i}th subtoken already has the correct tag (i.e., if
414 C{tagged_tokens[i][1]} == C{correctTag}), then
415 C{applicable_rules} should return the empty list.
416
417 @param tokens: The tagged tokens being tagged.
418 @type tokens: C{list} of C{tuple}
419 @param i: The index of the token whose tag should be corrected.
420 @type i: C{int}
421 @param correctTag: The correct tag for the C{i}th token.
422 @type correctTag: (any)
423 @rtype: C{list} of L{BrillRuleI}
424 """
425 raise AssertionError, "BrillTemplateI is an abstract interface"
426
428 """
429 Returns the set of indices C{i} such that
430 C{applicable_rules(token, index, ...)} depends on the value of
431 the C{i}th subtoken of C{token}.
432
433 This method is used by the \"fast\" Brill tagger trainer.
434
435 @param token: The tokens being tagged.
436 @type token: C{list} of C{tuple}
437 @param index: The index whose neighborhood should be returned.
438 @type index: C{int}
439 @rtype: C{Set}
440 """
441 raise AssertionError, "BrillTemplateI is an abstract interface"
442
444 """
445 An brill templates that generates a list of
446 L{ProximateTokensRule}s that apply at a given corpus
447 position. In particular, each C{ProximateTokensTemplate} is
448 parameterized by a proximate token brill rule class and a list of
449 boundaries, and generates all rules that:
450
451 - use the given brill rule class
452 - use the given list of boundaries as the C{start} and C{end}
453 points for their conditions
454 - are applicable to the given token.
455 """
456 - def __init__(self, rule_class, *boundaries):
457 """
458 Construct a template for generating proximate token brill
459 rules.
460
461 @type rule_class: C{class}
462 @param rule_class: The proximate token brill rule class that
463 should be used to generate new rules. This class must be a
464 subclass of L{ProximateTokensRule}.
465 @type boundaries: C{tuple} of C{(int, int)}
466 @param boundaries: A list of tuples C{(start, end)}, each of
467 which specifies a range for which a condition should be
468 created by each rule.
469 @raise ValueError: If C{start}>C{end} for any boundary.
470 """
471 self._rule_class = rule_class
472 self._boundaries = boundaries
473 for (s,e) in boundaries:
474 if s>e:
475 raise ValueError('Boundary %s has an invalid range' %
476 ((s,e),))
477
479 if tokens[index][1] == correct_tag:
480 return []
481
482
483
484 applicable_conditions = \
485 [self._applicable_conditions(tokens, index, start, end)
486 for (start, end) in self._boundaries]
487
488
489
490
491 condition_combos = [[]]
492 for conditions in applicable_conditions:
493 condition_combos = [old_conditions+[new_condition]
494 for old_conditions in condition_combos
495 for new_condition in conditions]
496
497
498 return [self._rule_class(tokens[index][1], correct_tag, *conds)
499 for conds in condition_combos]
500
502 """
503 @return: A set of all conditions for proximate token rules
504 that are applicable to C{tokens[index]}, given boundaries of
505 C{(start, end)}. I.e., return a list of all tuples C{(start,
506 end, M{value})}, such the property value of at least one token
507 between M{index+start} and M{index+end} (inclusive) is
508 M{value}.
509 """
510 conditions = set()
511 s = max(0, index+start)
512 e = min(index+end+1, len(tokens))
513 for i in range(s, e):
514 value = self._rule_class.extract_property(tokens[i])
515 conditions.add( (start, end, value) )
516 return conditions
517
519
520 neighborhood = set([index])
521 for (start, end) in self._boundaries:
522 s = max(0, index+start)
523 e = min(index+end+1, len(tokens))
524 for i in range(s, e):
525 neighborhood.add(i)
526
527 return neighborhood
528
530 """
531 Simulates two L{ProximateTokensTemplate}s which are symmetric
532 across the location of the token. For rules of the form \"If the
533 M{n}th token is tagged C{A}, and any tag preceding B{or} following
534 the M{n}th token by a distance between M{x} and M{y} is C{B}, and
535 ... , then change the tag of the nth token from C{A} to C{C}.\"
536
537 One C{ProximateTokensTemplate} is formed by passing in the
538 same arguments given to this class's constructor: tuples
539 representing intervals in which a tag may be found. The other
540 C{ProximateTokensTemplate} is constructed with the negative
541 of all the arguments in reversed order. For example, a
542 C{SymmetricProximateTokensTemplate} using the pair (-2,-1) and the
543 constructor C{ProximateTagsTemplate} generates the same rules as a
544 C{ProximateTagsTemplate} using (-2,-1) plus a second
545 C{ProximateTagsTemplate} using (1,2).
546
547 This is useful because we typically don't want templates to
548 specify only \"following\" or only \"preceding\"; we'd like our
549 rules to be able to look in either direction.
550 """
551 - def __init__(self, rule_class, *boundaries):
552 """
553 Construct a template for generating proximate token brill
554 rules.
555
556 @type rule_class: C{class}
557 @param rule_class: The proximate token brill rule class that
558 should be used to generate new rules. This class must be a
559 subclass of L{ProximateTokensRule}.
560 @type boundaries: C{tuple} of C{(int, int)}
561 @param boundaries: A list of tuples C{(start, end)}, each of
562 which specifies a range for which a condition should be
563 created by each rule.
564 @raise ValueError: If C{start}>C{end} for any boundary.
565 """
566 self._ptt1 = ProximateTokensTemplate(rule_class, *boundaries)
567 reversed = [(-e,-s) for (s,e) in boundaries]
568 self._ptt2 = ProximateTokensTemplate(rule_class, *reversed)
569
570
572 """
573 See L{BrillTemplateI} for full specifications.
574
575 @rtype: list of ProximateTokensRule
576 """
577 return (self._ptt1.applicable_rules(tokens, index, correctTag) +
578 self._ptt2.applicable_rules(tokens, index, correctTag))
579
585
586
587
588
589
591 """
592 A trainer for brill taggers.
593 """
594 - def __init__(self, initial_tagger, templates, trace=0):
595 self._initial_tagger = initial_tagger
596 self._templates = templates
597 self._trace = trace
598
599
600
601
602
603 - def train(self, train_tokens, max_rules=200, min_score=2):
604 """
605 Trains the Brill tagger on the corpus C{train_token},
606 producing at most C{max_rules} transformations, each of which
607 reduces the net number of errors in the corpus by at least
608 C{min_score}.
609
610 @type train_tokens: C{list} of L{tuple}
611 @param train_tokens: The corpus of tagged tokens
612 @type max_rules: C{int}
613 @param max_rules: The maximum number of transformations to be created
614 @type min_score: C{int}
615 @param min_score: The minimum acceptable net error reduction
616 that each transformation must produce in the corpus.
617 """
618 if self._trace > 0: print ("Training Brill tagger on %d tokens..." %
619 len(train_tokens))
620
621
622
623
624
625 test_tokens = list(self._initial_tagger.tag(t[0] for t in train_tokens))
626
627 if self._trace > 2: self._trace_header()
628
629
630 rules = []
631 try:
632 while len(rules) < max_rules:
633 old_tags = [t[1] for t in test_tokens]
634 (rule, score, fixscore) = self._best_rule(test_tokens,
635 train_tokens)
636 if rule is None or score < min_score:
637 if self._trace > 1:
638 print 'Insufficient improvement; stopping'
639 break
640 else:
641
642 rules.append(rule)
643
644 k = rule.apply_to(test_tokens)
645
646 if self._trace > 1:
647 self._trace_rule(rule, score, fixscore, len(k))
648
649 except KeyboardInterrupt: pass
650
651
652 return Brill(self._initial_tagger, rules)
653
654
655
656
657
658
659
661
662
663
664
665 correct_indices = {}
666 for i in range(len(test_tokens)):
667 if test_tokens[i][1] == train_tokens[i][1]:
668 tag = test_tokens[i][1]
669 correct_indices.setdefault(tag, []).append(i)
670
671
672
673
674 rules = self._find_rules(test_tokens, train_tokens)
675
676
677 best_rule, best_score, best_fixscore = None, 0, 0
678
679
680
681
682 for (rule, fixscore) in rules:
683
684
685
686 if best_score >= fixscore:
687 return best_rule, best_score, best_fixscore
688
689
690
691
692 score = fixscore
693 if correct_indices.has_key(rule.original_tag()):
694 for i in correct_indices[rule.original_tag()]:
695 if rule.applies(test_tokens, i):
696 score -= 1
697
698
699 if score <= best_score: break
700
701
702
703
704
705 if score > best_score:
706 best_rule, best_score, best_fixscore = rule, score, fixscore
707
708
709 return best_rule, best_score, best_fixscore
710
712 """
713 Find all rules that correct at least one token's tag in
714 C{test_tokens}.
715
716 @return: A list of tuples C{(rule, fixscore)}, where C{rule}
717 is a brill rule and C{fixscore} is the number of tokens
718 whose tag the rule corrects. Note that C{fixscore} does
719 I{not} include the number of tokens whose tags are changed
720 to incorrect values.
721 """
722
723
724 error_indices = [i for i in range(len(test_tokens))
725 if (test_tokens[i][1] !=
726 train_tokens[i][1])]
727
728
729
730 rule_score_dict = {}
731 for i in range(len(test_tokens)):
732 rules = self._find_rules_at(test_tokens, train_tokens, i)
733 for rule in rules:
734 rule_score_dict[rule] = rule_score_dict.get(rule,0) + 1
735
736
737
738 rule_score_items = rule_score_dict.items()
739 temp = [(-score, rule) for (rule, score) in rule_score_items]
740 temp.sort()
741 return [(rule, -negscore) for (negscore, rule) in temp]
742
744 """
745 @rtype: C{Set}
746 @return: the set of all rules (based on the templates) that
747 correct token C{i}'s tag in C{test_tokens}.
748 """
749
750 applicable_rules = set()
751 if test_tokens[i][1] != train_tokens[i][1]:
752 correct_tag = train_tokens[i][1]
753 for template in self._templates:
754 new_rules = template.applicable_rules(test_tokens, i,
755 correct_tag)
756 applicable_rules.update(new_rules)
757
758 return applicable_rules
759
760
761
762
763
765 print """
766 B |
767 S F r O | Score = Fixed - Broken
768 c i o t | R Fixed = num tags changed incorrect -> correct
769 o x k h | u Broken = num tags changed correct -> incorrect
770 r e e e | l Other = num tags changed incorrect -> incorrect
771 e d n r | e
772 ------------------+-------------------------------------------------------
773 """.rstrip()
774
775 - def _trace_rule(self, rule, score, fixscore, numchanges):
776 if self._trace > 2:
777 print ('%4d%4d%4d%4d ' % (score, fixscore, fixscore-score,
778 numchanges-fixscore*2+score)), '|',
779 print rule
780
781
782
783
784
786 """
787 A faster trainer for brill taggers.
788 """
789 - def __init__(self, initial_tagger, templates, trace=0):
790 self._initial_tagger = initial_tagger
791 self._templates = templates
792 self._trace = trace
793
794
795
796
797
798 - def train(self, train_tokens, max_rules=200, min_score=2):
799
800
801
802 TESTING = False
803
804
805
806
807
808 rulesByPosition = []
809 for i in range(len(train_tokens)):
810 rulesByPosition.append(set())
811
812
813
814
815 positionsByRule = {}
816
817
818 rulesByScore = {0:{}}
819
820 ruleScores = {}
821
822 tagIndices = {}
823
824
825
826
827
828
829
830 firstUnknownIndex = {}
831
832
833
834 def _initRule (rule):
835 positionsByRule[rule] = {}
836 rulesByScore[0][rule] = None
837 ruleScores[rule] = 0
838 firstUnknownIndex[rule] = 0
839
840
841
842 def _updateRuleApplies (rule, i):
843
844
845
846 if positionsByRule[rule].has_key(i):
847 return
848
849 if rule.replacement_tag() == train_tokens[i][1]:
850 positionsByRule[rule][i] = 1
851 elif rule.original_tag() == train_tokens[i][1]:
852 positionsByRule[rule][i] = -1
853 else:
854 positionsByRule[rule][i] = 0
855
856
857 del rulesByScore[ruleScores[rule]][rule]
858 ruleScores[rule] += positionsByRule[rule][i]
859 if not rulesByScore.has_key(ruleScores[rule]):
860 rulesByScore[ruleScores[rule]] = {}
861 rulesByScore[ruleScores[rule]][rule] = None
862 rulesByPosition[i].add(rule)
863
864
865
866 def _updateRuleNotApplies (rule, i):
867 del rulesByScore[ruleScores[rule]][rule]
868 ruleScores[rule] -= positionsByRule[rule][i]
869 if not rulesByScore.has_key(ruleScores[rule]):
870 rulesByScore[ruleScores[rule]] = {}
871 rulesByScore[ruleScores[rule]][rule] = None
872
873 del positionsByRule[rule][i]
874 rulesByPosition[i].remove(rule)
875
876
877
878 tagged_tokens = list(self._initial_tagger.tag(t[0] for t in train_tokens))
879
880
881 errorIndices = []
882 for i in range(len(tagged_tokens)):
883 tag = tagged_tokens[i][1]
884 if tag != train_tokens[i][1]:
885 errorIndices.append(i)
886 if not tagIndices.has_key(tag):
887 tagIndices[tag] = []
888 tagIndices[tag].append(i)
889
890 print "Finding useful rules..."
891
892 for i in errorIndices:
893 for template in self._templates:
894
895 for rule in template.applicable_rules(tagged_tokens, i,
896 train_tokens[i][1]):
897 if not positionsByRule.has_key(rule):
898 _initRule(rule)
899 _updateRuleApplies(rule, i)
900
901 print "Done initializing %i useful rules." %len(positionsByRule)
902
903 if TESTING:
904 after = -1
905
906
907 maxScore = max(rulesByScore.keys())
908 rules = []
909 while len(rules) < max_rules and maxScore >= min_score:
910
911
912
913
914
915
916
917
918
919 bestRule = None
920 bestRules = rulesByScore[maxScore].keys()
921
922 for rule in bestRules:
923
924
925 ti = bisect.bisect_left(tagIndices[rule.original_tag()],
926 firstUnknownIndex[rule])
927 for nextIndex in tagIndices[rule.original_tag()][ti:]:
928 if rule.applies(tagged_tokens, nextIndex):
929 _updateRuleApplies(rule, nextIndex)
930 if ruleScores[rule] < maxScore:
931 firstUnknownIndex[rule] = nextIndex+1
932 break
933
934
935 if ruleScores[rule] == maxScore:
936 firstUnknownIndex[rule] = len(tagged_tokens)
937 print "%i) %s (score: %i)" %(len(rules)+1, rule, maxScore)
938 bestRule = rule
939 break
940
941 if bestRule == None:
942 del rulesByScore[maxScore]
943 maxScore = max(rulesByScore.keys())
944 continue
945
946
947 if TESTING:
948 before = len(_errorPositions(tagged_tokens, train_tokens))
949 print "There are %i errors before applying this rule." %before
950 assert after == -1 or before == after, \
951 "after=%i but before=%i" %(after,before)
952
953 print "Applying best rule at %i locations..." \
954 %len(positionsByRule[bestRule].keys())
955
956
957
958
959
960 rules.append(bestRule)
961 bestRule.apply_at(tagged_tokens, positionsByRule[bestRule].keys())
962
963
964 for i in positionsByRule[bestRule].keys():
965
966
967 oldIndex = bisect.bisect_left(tagIndices[bestRule.original_tag()], i)
968 del tagIndices[bestRule.original_tag()][oldIndex]
969
970
971 if not tagIndices.has_key(bestRule.replacement_tag()):
972 tagIndices[bestRule.replacement_tag()] = []
973 newIndex = bisect.bisect_left(tagIndices[bestRule.replacement_tag()], i)
974 tagIndices[bestRule.replacement_tag()].insert(newIndex, i)
975
976
977
978
979
980
981
982
983
984 print "Updating neighborhoods of changed sites.\n"
985
986
987 neighbors = set()
988 for i in positionsByRule[bestRule].keys():
989 for template in self._templates:
990 neighbors.update(template.get_neighborhood(tagged_tokens, i))
991
992
993 c = d = e = 0
994 for i in neighbors:
995 siteRules = set()
996 for template in self._templates:
997
998 siteRules.update(set(template.applicable_rules(
999 tagged_tokens, i, train_tokens[i][1])))
1000
1001
1002 for obsolete in rulesByPosition[i] - siteRules:
1003 c += 1
1004 _updateRuleNotApplies(obsolete, i)
1005
1006
1007 for newRule in siteRules - rulesByPosition[i]:
1008 d += 1
1009 if not positionsByRule.has_key(newRule):
1010 e += 1
1011 _initRule(newRule)
1012 _updateRuleApplies(newRule, i)
1013
1014 if TESTING:
1015 after = before - maxScore
1016 print "%i obsolete rule applications, %i new ones, " %(c,d)+ \
1017 "using %i previously-unseen rules." %e
1018
1019 maxScore = max(rulesByScore.keys())
1020
1021
1022 if self._trace > 0: print ("Training Brill tagger on %d tokens..." %
1023 len(train_tokens))
1024
1025
1026 rules_by_position = [{} for tok in train_tokens]
1027
1028
1029 return Brill(self._initial_tagger, rules)
1030
1031
1032
1033
1034
def _errorPositions(train_tokens, tokens):
    # NOTE(review): this function's "def" line was lost in extraction;
    # the name and signature are reconstructed from the two call sites
    # in this file.  The comparison is symmetric, so argument order
    # does not affect the result.
    #
    # Return the indices at which the two taggings disagree.
    return [i for i in range(len(tokens))
            if tokens[i][1] != train_tokens[i][1]]
1039
1040
def errorList(train_tokens, tokens, radius=2):
    """
    Returns a list of human-readable strings indicating the errors in the
    given tagging of the corpus.

    @param train_tokens: The correct tagging of the corpus
    @type train_tokens: C{list} of C{tuple}
    @param tokens: The tagged corpus
    @type tokens: C{list} of C{tuple}
    @param radius: How many tokens on either side of a wrongly-tagged token
        to include in the error string.  For example, if C{radius}=2, each
        error string will show the incorrect token plus two tokens on either
        side.
    @type radius: int
    """
    errors = []
    indices = _errorPositions(train_tokens, tokens)
    tokenLen = len(tokens)
    for i in indices:
        # "<wrong> -> <right>: " prefix, then the context window with the
        # offending token highlighted.
        ei = tokens[i][1].rjust(3) + " -> " \
             + train_tokens[i][1].rjust(3) + ": "
        for j in range(max(i - radius, 0), min(i + radius + 1, tokenLen)):
            if tokens[j][0] == tokens[j][1]:
                # Word identical to its tag (e.g. punctuation): show once.
                s = tokens[j][0]
            else:
                s = tokens[j][0] + "/" + tokens[j][1]

            if j == i:
                ei += "**" + s + "** "
            else:
                ei += s + " "
        errors.append(ei)

    return errors
1074
1075
1076
1077
1078
def demo(num_sents=100, max_rules=200, min_score=3, error_output="errors.out",
         rule_output="rules.yaml", randomize=False, train=.8, trace=3):
    """
    Brill Tagger Demonstration

    @param num_sents: how many sentences of training and testing data to use
    @type num_sents: L{int}
    @param max_rules: maximum number of rule instances to create
    @type max_rules: L{int}
    @param min_score: the minimum score for a rule in order for it to be considered
    @type min_score: L{int}
    @param error_output: the file where errors will be saved
    @type error_output: L{string}
    @param rule_output: the file where rules will be saved
    @type rule_output: L{string}
    @param randomize: whether the training data should be a random subset of the corpus
    @type randomize: L{boolean}
    @param train: the fraction of the the corpus to be used for training (1=all)
    @type train: L{float}
    @param trace: the level of diagnostic tracing output to produce (0-3)
    @type trace: L{int}
    """
    from nltk_lite.corpora import treebank
    from nltk_lite import tag
    from nltk_lite.tag import brill

    # Baseline backoff tagger: numbers get CD, everything else NN.
    NN_CD_tagger = tag.Regexp([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')])

    # Load the corpus, optionally shuffling it (with a fixed seed so the
    # shuffle is reproducible).
    print("Loading tagged data...")
    sents = list(treebank.tagged())
    if randomize:
        random.seed(len(sents))
        random.shuffle(sents)

    # Flatten the sentences, and split into training and gold portions.
    tagged_data = [t for s in sents[:num_sents] for t in s]
    cutoff = int(len(tagged_data) * train)

    training_data = tagged_data[:cutoff]
    gold_data = tagged_data[cutoff:]

    testing_data = [t[0] for t in gold_data]

    # Unigram tagger (the initial tagger for the Brill trainer).
    print("Training unigram tagger:", end=' ')
    u = tag.Unigram(backoff=NN_CD_tagger)

    # These tagger methods expect a list-of-sentences structure, so wrap
    # the flattened data in an extra list.
    u.train([training_data])
    print("[accuracy: %f]" % tag.accuracy(u, [gold_data]))

    # Brill tagger templates: tag and word contexts at various windows.
    templates = [
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 3)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 3)),
        brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1, 1)),
        brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1, 1)),
    ]

    trainer = brill.BrillTrainer(u, templates, trace)
    b = trainer.train(training_data, max_rules, min_score)

    print()
    print("Brill accuracy: %f" % tag.accuracy(b, [gold_data]))

    print("\nRules: ")
    for rule in b.rules():
        print(str(rule))

    # Save the learned rules as YAML.  (Py2 file() replaced by open(),
    # managed with "with" so the handles are always closed.)
    with open(rule_output, 'w') as printRules:
        yaml.dump(b, printRules)

    # Tag the held-out data and save a human-readable error report.
    testing_data = list(b.tag(testing_data))
    el = errorList(gold_data, testing_data)
    with open(error_output, 'w') as errorFile:
        for e in el:
            errorFile.write(e + "\n\n")
    print("Done; rules and errors saved to %s and %s." %
          (rule_output, error_output))


if __name__ == '__main__':
    demo()
1176