Package nltk_lite :: Package parse :: Module pchart
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.parse.pchart

  1  # Natural Language Toolkit: Probabilistic Chart Parsers 
  2  # 
  3  # Copyright (C) 2001-2007 University of Pennsylvania 
  4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
  5  #         Steven Bird <sb@csse.unimelb.edu.au> 
  6  # URL: <http://nltk.sf.net> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  """ 
 10  Classes and interfaces for associating probabilities with tree 
 11  structures that represent the internal organization of a text.  The 
 12  probabilistic parser module defines C{BottomUpChartParse}. 
 13   
 14  C{BottomUpChartParse} is an abstract class that implements a 
 15  bottom-up chart parser for C{PCFG}s.  It maintains a queue of edges, 
 16  and adds them to the chart one at a time.  The ordering of this queue 
 17  is based on the probabilities associated with the edges, allowing the 
 18  parser to expand more likely edges before less likely ones.  Each 
 19  subclass implements a different queue ordering, producing different 
 20  search strategies.  Currently the following subclasses are defined: 
 21   
 22    - C{InsideParse} searches edges in decreasing order of 
 23      their trees' inside probabilities. 
 24    - C{RandomParse} searches edges in random order. 
 25    - C{LongestParse} searches edges in decreasing order of their 
 26      location's length. 
 27   
 28  The C{BottomUpChartParse} constructor has an optional argument beam_size. 
 29  If non-zero, this controls the size of the beam (aka the edge queue). 
 30  This option is most useful with InsideParse. 
 31  """ 
 32   
 33  ##////////////////////////////////////////////////////// 
 34  ##  Bottom-Up PCFG Chart Parser 
 35  ##////////////////////////////////////////////////////// 
 36   
 37  # [XX] This might not be implemented quite right -- it would be better 
 38  # to associate probabilities with child pointer lists. 
 39   
 40  from nltk_lite.parse import * 
 41   
 42  # Probabilistic edges 
43 -class ProbabilisticLeafEdge(LeafEdge):
44 - def prob(self): return 1.0
45
46 -class ProbabilisticTreeEdge(TreeEdge):
47 - def __init__(self, prob, *args, **kwargs):
48 self._prob = prob 49 TreeEdge.__init__(self, *args, **kwargs)
50 - def prob(self): return self._prob
51 - def __cmp__(self, other):
52 if self._prob != other.prob(): return -1 53 return TreeEdge.__cmp__(self, other)
54 - def from_production(production, index, p):
55 return ProbabilisticTreeEdge(p, (index, index), production.lhs(), 56 production.rhs(), 0)
57 from_production = staticmethod(from_production)
58 59 # Rules using probabilistic edges
60 -class BottomUpInitRule(AbstractChartRule):
61 NUM_EDGES=0
62 - def apply_iter(self, chart, grammar):
63 for index in range(chart.num_leaves()): 64 new_edge = ProbabilisticLeafEdge(chart.leaf(index), index) 65 if chart.insert(new_edge, ()): 66 yield new_edge
67
68 -class BottomUpPredictRule(AbstractChartRule):
69 NUM_EDGES=1
70 - def apply_iter(self, chart, grammar, edge):
71 if edge.is_incomplete(): return 72 for prod in grammar.productions(): 73 if edge.lhs() == prod.rhs()[0]: 74 new_edge = ProbabilisticTreeEdge.from_production(prod, edge.start(), prod.prob()) 75 if chart.insert(new_edge, ()): 76 yield new_edge
77
78 -class ProbabilisticFundamentalRule(AbstractChartRule):
79 NUM_EDGES=2
80 - def apply_iter(self, chart, grammar, left_edge, right_edge):
81 # Make sure the rule is applicable. 82 if not (left_edge.end() == right_edge.start() and 83 left_edge.next() == right_edge.lhs() and 84 left_edge.is_incomplete() and right_edge.is_complete()): 85 return 86 87 # Construct the new edge. 88 p = left_edge.prob() * right_edge.prob() 89 new_edge = ProbabilisticTreeEdge(p, 90 span=(left_edge.start(), right_edge.end()), 91 lhs=left_edge.lhs(), rhs=left_edge.rhs(), 92 dot=left_edge.dot()+1) 93 94 # Add it to the chart, with appropriate child pointers. 95 changed_chart = False 96 for cpl1 in chart.child_pointer_lists(left_edge): 97 if chart.insert(new_edge, cpl1+(right_edge,)): 98 changed_chart = True 99 100 # If we changed the chart, then generate the edge. 101 if changed_chart: yield new_edge
102
103 -class SingleEdgeProbabilisticFundamentalRule(AbstractChartRule):
104 NUM_EDGES=1 105 106 _fundamental_rule = ProbabilisticFundamentalRule() 107
108 - def apply_iter(self, chart, grammar, edge1):
109 fr = self._fundamental_rule 110 if edge1.is_incomplete(): 111 # edge1 = left_edge; edge2 = right_edge 112 for edge2 in chart.select(start=edge1.end(), is_complete=True, 113 lhs=edge1.next()): 114 for new_edge in fr.apply_iter(chart, grammar, edge1, edge2): 115 yield new_edge 116 else: 117 # edge2 = left_edge; edge1 = right_edge 118 for edge2 in chart.select(end=edge1.start(), is_complete=False, 119 next=edge1.lhs()): 120 for new_edge in fr.apply_iter(chart, grammar, edge2, edge1): 121 yield new_edge
122
123 - def __str__(self): return 'Fundamental Rule'
124
125 -class BottomUpChartParse(AbstractParse):
126 """ 127 An abstract bottom-up parser for C{PCFG}s that uses a C{Chart} to 128 record partial results. C{BottomUpChartParse} maintains a 129 queue of edges that can be added to the chart. This queue is 130 initialized with edges for each token in the text that is being 131 parsed. C{BottomUpChartParse} inserts these edges into the 132 chart one at a time, starting with the most likely edges, and 133 proceeding to less likely edges. For each edge that is added to 134 the chart, it may become possible to insert additional edges into 135 the chart; these are added to the queue. This process continues 136 until enough complete parses have been generated, or until the 137 queue is empty. 138 139 The sorting order for the queue is not specified by 140 C{BottomUpChartParse}. Different sorting orders will result 141 in different search strategies. The sorting order for the queue 142 is defined by the method C{sort_queue}; subclasses are required 143 to provide a definition for this method. 144 145 @type _grammar: C{PCFG} 146 @ivar _grammar: The grammar used to parse sentences. 147 @type _trace: C{int} 148 @ivar _trace: The level of tracing output that should be generated 149 when parsing a text. 150 """
151 - def __init__(self, grammar, beam_size=0, trace=0):
152 """ 153 Create a new C{BottomUpChartParse}, that uses C{grammar} 154 to parse texts. 155 156 @type grammar: C{PCFG} 157 @param grammar: The grammar used to parse texts. 158 @type beam_size: C{int} 159 @param beam_size: The maximum length for the parser's edge queue. 160 @type trace: C{int} 161 @param trace: The level of tracing that should be used when 162 parsing a text. C{0} will generate no tracing output; 163 and higher numbers will produce more verbose tracing 164 output. 165 """ 166 self._grammar = grammar 167 self.beam_size = beam_size 168 self._trace = trace 169 AbstractParse.__init__(self)
170
171 - def trace(self, trace=2):
172 """ 173 Set the level of tracing output that should be generated when 174 parsing a text. 175 176 @type trace: C{int} 177 @param trace: The trace level. A trace level of C{0} will 178 generate no tracing output; and higher trace levels will 179 produce more verbose tracing output. 180 @rtype: C{None} 181 """ 182 self._trace = trace
183
184 - def get_parse_list(self, tokens):
185 chart = Chart(list(tokens)) 186 grammar = self._grammar 187 188 # Chart parser rules. 189 bu_init = BottomUpInitRule() 190 bu = BottomUpPredictRule() 191 fr = SingleEdgeProbabilisticFundamentalRule() 192 193 # Our queue! 194 queue = [] 195 196 # Initialize the chart. 197 for e in bu_init.apply_iter(chart, grammar): 198 if self._trace>1: chart.pp_edge(e,width=2) 199 queue.append(e) 200 201 while len(queue) > 0: 202 # Re-sort the queue. 203 self.sort_queue(queue, chart) 204 205 # Prune the queue to the correct size if a beam was defined 206 if self.beam_size: 207 self._prune(queue, chart) 208 209 # Get the best edge. 210 edge = queue.pop() 211 if self._trace>0: 212 print ' %-50s [%s]' % (chart.pp_edge(edge,width=2), 213 edge.prob()) 214 215 # Apply BU & FR to it. 216 queue.extend(bu.apply(chart, grammar, edge)) 217 queue.extend(fr.apply(chart, grammar, edge)) 218 219 # Get a list of complete parses. 220 parses = chart.parses(grammar.start(), ProbabilisticTree) 221 222 # Assign probabilities to the trees. 223 prod_probs = {} 224 for prod in grammar.productions(): 225 prod_probs[prod.lhs(), prod.rhs()] = prod.prob() 226 for parse in parses: 227 self._setprob(parse, prod_probs) 228 229 # Sort by probability 230 parses.sort(lambda a,b: cmp(b.prob(), a.prob())) 231 232 return parses
233
234 - def _setprob(self, tree, prod_probs):
235 if tree.prob() is not None: return 236 237 # Get the prob of the CFG production. 238 lhs = Nonterminal(tree.node) 239 rhs = [] 240 for child in tree: 241 if isinstance(child, Tree): 242 rhs.append(Nonterminal(child.node)) 243 else: 244 rhs.append(child) 245 prob = prod_probs[lhs, tuple(rhs)] 246 247 # Get the probs of children. 248 for child in tree: 249 if isinstance(child, Tree): 250 self._setprob(child, prod_probs) 251 prob *= child.prob() 252 253 tree.set_prob(prob)
254
255 - def sort_queue(self, queue, chart):
256 """ 257 Sort the given queue of C{Edge}s, placing the edge that should 258 be tried first at the beginning of the queue. This method 259 will be called after each C{Edge} is added to the queue. 260 261 @param queue: The queue of C{Edge}s to sort. Each edge in 262 this queue is an edge that could be added to the chart by 263 the fundamental rule; but that has not yet been added. 264 @type queue: C{list} of C{Edge} 265 @param chart: The chart being used to parse the text. This 266 chart can be used to provide extra information for sorting 267 the queue. 268 @type chart: C{Chart} 269 @rtype: C{None} 270 """ 271 raise AssertionError, "BottomUpChartParse is an abstract class"
272
273 - def _prune(self, queue, chart):
274 """ Discard items in the queue if the queue is longer than the beam.""" 275 if len(queue) > self.beam_size: 276 split = len(queue)-self.beam_size 277 if self._trace > 2: 278 for edge in queue[:split]: 279 print ' %-50s [DISCARDED]' % chart.pp_edge(edge,2) 280 del queue[:split]
281
282 -class InsideParse(BottomUpChartParse):
283 """ 284 A bottom-up parser for C{PCFG}s that tries edges in descending 285 order of the inside probabilities of their trees. The X{inside 286 probability} of a tree is simply the 287 probability of the entire tree, ignoring its context. In 288 particular, the inside probability of a tree generated by 289 production M{p} with children M{c[1]}, M{c[2]}, ..., M{c[n]} is 290 P(M{p})*P(M{c[1]})*P(M{c[2]})*M{...}*P(M{c[n]}); and the inside 291 probability of a token is 1 if it is present in the text, and 0 if 292 it is absent. 293 294 This sorting order results in a type of lowest-cost-first search 295 strategy. 296 """ 297 # Inherit constructor.
298 - def sort_queue(self, queue, chart):
299 """ 300 Sort the given queue of edges, in descending order of the 301 inside probabilities of the edges' trees. 302 303 @param queue: The queue of C{Edge}s to sort. Each edge in 304 this queue is an edge that could be added to the chart by 305 the fundamental rule; but that has not yet been added. 306 @type queue: C{list} of C{Edge} 307 @param chart: The chart being used to parse the text. This 308 chart can be used to provide extra information for sorting 309 the queue. 310 @type chart: C{Chart} 311 @rtype: C{None} 312 """ 313 queue.sort(lambda e1,e2:cmp(e1.prob(), e2.prob()))
314 315 # Eventually, this will become some sort of inside-outside parser: 316 # class InsideOutsideParse(BottomUpChartParse): 317 # def __init__(self, grammar, trace=0): 318 # # Inherit docs. 319 # BottomUpChartParse.__init__(self, grammar, trace) 320 # 321 # # Find the best path from S to each nonterminal 322 # bestp = {} 323 # for production in grammar.productions(): bestp[production.lhs()]=0 324 # bestp[grammar.start()] = 1.0 325 # 326 # for i in range(len(grammar.productions())): 327 # for production in grammar.productions(): 328 # lhs = production.lhs() 329 # for elt in production.rhs(): 330 # bestp[elt] = max(bestp[lhs]*production.prob(), 331 # bestp.get(elt,0)) 332 # 333 # self._bestp = bestp 334 # for (k,v) in self._bestp.items(): print k,v 335 # 336 # def _cmp(self, e1, e2): 337 # return cmp(e1.structure()[PROB]*self._bestp[e1.lhs()], 338 # e2.structure()[PROB]*self._bestp[e2.lhs()]) 339 # 340 # def sort_queue(self, queue, chart): 341 # queue.sort(self._cmp) 342 343 import random
344 -class RandomParse(BottomUpChartParse):
345 """ 346 A bottom-up parser for C{PCFG}s that tries edges in random order. 347 This sorting order results in a random search strategy. 348 """ 349 # Inherit constructor
350 - def sort_queue(self, queue, chart):
351 i = random.randint(0, len(queue)-1) 352 (queue[-1], queue[i]) = (queue[i], queue[-1])
353
354 -class UnsortedParse(BottomUpChartParse):
355 """ 356 A bottom-up parser for C{PCFG}s that tries edges in whatever order. 357 """ 358 # Inherit constructor
359 - def sort_queue(self, queue, chart): return
360
361 -class LongestParse(BottomUpChartParse):
362 """ 363 A bottom-up parser for C{PCFG}s that tries longer edges before 364 shorter ones. This sorting order results in a type of best-first 365 search strategy. 366 """ 367 # Inherit constructor
368 - def sort_queue(self, queue, chart):
369 queue.sort(lambda e1,e2: cmp(e1.length(), e2.length()))
370 371 ##////////////////////////////////////////////////////// 372 ## Test Code 373 ##////////////////////////////////////////////////////// 374
375 -def demo():
376 """ 377 A demonstration of the probabilistic parsers. The user is 378 prompted to select which demo to run, and how many parses should 379 be found; and then each parser is run on the same demo, and a 380 summary of the results are displayed. 381 """ 382 import sys, time 383 from nltk_lite import tokenize 384 from nltk_lite.parse import cfg, pcfg, pchart 385 386 # Define two demos. Each demo has a sentence and a grammar. 387 demos = [('I saw John with my cookie', pcfg.toy1), 388 ('the boy saw Jack with Bob under the table with a telescope', 389 pcfg.toy2)] 390 391 # Ask the user which demo they want to use. 392 print 393 for i in range(len(demos)): 394 print '%3s: %s' % (i+1, demos[i][0]) 395 print ' %r' % demos[i][1] 396 print 397 print 'Which demo (%d-%d)? ' % (1, len(demos)), 398 try: 399 snum = int(sys.stdin.readline().strip())-1 400 sent, grammar = demos[snum] 401 except: 402 print 'Bad sentence number' 403 return 404 405 # Tokenize the sentence. 406 tokens = list(tokenize.whitespace(sent)) 407 408 # Define a list of parsers. We'll use all parsers. 409 parsers = [ 410 pchart.InsideParse(grammar), 411 pchart.RandomParse(grammar), 412 pchart.UnsortedParse(grammar), 413 pchart.LongestParse(grammar), 414 pchart.InsideParse(grammar, beam_size = len(tokens)+1) # was BeamParse 415 ] 416 417 # Run the parsers on the tokenized sentence. 418 times = [] 419 average_p = [] 420 num_parses = [] 421 all_parses = {} 422 for parser in parsers: 423 print '\ns: %s\nparser: %s\ngrammar: %s' % (sent,parser,pcfg) 424 parser.trace(3) 425 t = time.time() 426 parses = parser.get_parse_list(tokens) 427 times.append(time.time()-t) 428 if parses: p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses) 429 else: p = 0 430 average_p.append(p) 431 num_parses.append(len(parses)) 432 for p in parses: all_parses[p.freeze()] = 1 433 434 # Print some summary statistics 435 print 436 print ' Parser Beam | Time (secs) # Parses Average P(parse)' 437 print '------------------------+------------------------------------------' 438 for i in range(len(parsers)): 439 print '%18s %4d |%11.4f%11d%19.14f' % (parsers[i].__class__.__name__, 440 parsers[i].beam_size, 441 times[i],num_parses[i],average_p[i]) 442 parses = all_parses.keys() 443 if parses: p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses) 444 else: p = 0 445 print '------------------------+------------------------------------------' 446 print '%18s |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p) 447 448 # Ask the user if we should draw the parses. 449 print 450 print 'Draw parses (y/n)? ', 451 if sys.stdin.readline().strip().lower().startswith('y'): 452 from nltk_lite.draw.tree import draw_trees 453 print ' please wait...' 454 draw_trees(*parses) 455 456 # Ask the user if we should print the parses. 457 print 458 print 'Print parses (y/n)? ', 459 if sys.stdin.readline().strip().lower().startswith('y'): 460 for parse in parses: 461 print parse
462 463 if __name__ == '__main__': 464 demo() 465