1
2
3
4
5
6
7
8
9
10
11
12
13 from nltk_lite.parse import *
14 from nltk_lite.featurestructure import *
15 from nltk_lite.semantics import logic
16 import string
17
18 -class Category(FeatureStructure, Nonterminal):
19 """
20 A C{Category} is a specialized feature structure, intended for use in
21 parsing. It can act as a C{Nonterminal}.
22
23 A C{Category} differs from a C{FeatureStructure} in these ways:
24
25 - Categories may not be re-entrant.
26
27 - Categories use value-based equality, while FeatureStructures use
28 identity-based equality.
29
30 - Strings in Categories are compared case-insensitively.
31
32 - Categories have one feature marked as the 'head', which prints
33 differently than other features if it has a value. For example,
34 in the C{repr()} representation of a Category, the head goes to the
35 left, on the outside of the brackets. Subclasses of C{Category}
36 may change the feature name that is designated as the head, which is
37 _head by default.
38
39 - Subclasses of C{Category} may contain a list of I{required features},
40 which are names of features whose value is None if unspecified. A
41 Category lacking a feature that is required in it will not unify with
42 any Category that has that feature. If a required feature's value is
43 C{None}, it is considered to be not present. (Mixing different
44 subclasses of C{Category} is probably a bad idea.)
45
46 - C{True} and C{False} are allowed as values. A feature named C{foo}
47 with a value of C{True} is simply expressed as C{+foo}. Similarly, if
48 it is C{False}, it is expressed as C{-foo}.
49 """
50
51 headname = '_head'
52 requiredFeatures = []
53
65
67 "@return: A list of the names of all required features."
68 return self._required
69
72
74 """
75 @return: A new Category based on this one, with its C{/} feature set to
76 C{other}.
77 """
78 temp = self.deepcopy()
79 dict = temp._features
80 dict['/'] = other
81 return self.__class__(**dict)
82
84 """
85 @return: True if C{self} and C{other} assign the same value to
86 to every feature. In particular, return true if
87 C{self[M{p}]==other[M{p}]} for every feature path M{p} such
88 that C{self[M{p}]} or C{other[M{p}]} is a base value (i.e.,
89 not a nested Category).
90 @rtype: C{bool}
91 """
92
93
94
95 if not other.__class__ == self.__class__: return False
96 if hash(self) != hash(other): return False
97 return (self.equal_values(other) == True)
98
100 return not (self == other)
101
107
109 """
110 Freezing a Category memoizes its hash value, to make comparisons on it
111 faster. After freezing, the Category and all its values are immutable.
112
113 @return: self
114 """
115 for val in self._features.values():
116 if isinstance(val, Category) and not val.frozen():
117 val.freeze()
118 self._hash = hash(self)
119 self._memorepr = self._repr({}, {})
120 self._frozen = True
121 return self
122
124 """
125 Returns whether this Category is frozen (immutable).
126
127 @rtype: C{bool}
128 """
129 return self._frozen
130
132 if self._frozen: raise "Cannot modify a frozen Category"
133 self._features[name] = value
134
136 """
137 @return: The node value corresponding to this C{Category}.
138 @rtype: C{Category}
139 """
140 return self
141
143 """
144 @return: The head of this category (the value shown outside the
145 brackets in its string representation). If there is no head, returns
146 None.
147 @rtype: C{str} or C{None}
148 """
149 return self._features.get(self.__class__.headname)
150
152 """
153 @return: A deep copy of C{self}.
154 """
155 newcopy = self.__class__()
156 features = newcopy._features
157
158
159 for (fname, fval) in self._features.items():
160 if isinstance(fval, FeatureStructure):
161 features[fname] = fval.deepcopy()
162 else:
163 features[fname] = fval
164
165 return newcopy
166
169
171 """
172 @return: a list of all features that have values.
173 """
174 return filter(lambda x: not (x in self._required and self[x] is None),
175 self._features.keys())
176
178 try:
179 return self.__getitem__(*args)
180 except IndexError:
181 return StarValue()
182
185
186
187
188
189
194
201
202
203
204
205
209
210
211
212
213
214
216 """
217 @return: A string representation of this feature structure.
218 """
219 if self._memorepr is not None: return self._memorepr
220 else: return self._repr({}, {})
221
222 - def _repr(self, reentrances, reentrance_ids):
223 segments = []
224
225 items = self.feature_names()
226 items.sort()
227
228 for fname in items:
229 if fname == self.__class__.headname: continue
230 fval = self[fname]
231 if isinstance(fval, bool):
232 if fval: segments.append('+%s' % fname)
233 else: segments.append('-%s' % fname)
234 elif not isinstance(fval, Category):
235 segments.append('%s=%r' % (fname, fval))
236 else:
237 fval_repr = fval._repr(reentrances, reentrance_ids)
238 segments.append('%s=%s' % (fname, fval_repr))
239
240 head = self._features.get(self.__class__.headname)
241 if head is None: head = ''
242 if head and not len(segments): return head
243 return '%s[%s]' % (head, ', '.join(segments))
244
245 - def _str(self, reentrances, reentrance_ids):
246
247
248
249
250
251 if len(self.feature_names()) == 0:
252 return ['[]']
253 if self.feature_names() == [self.__class__.headname]:
254 return ['%s[]' % self[self.__class__.headname]]
255
256
257
258 maxfnamelen = max([len(k) for k in self.feature_names()])
259
260 lines = []
261 items = self.feature_names()
262 items.sort()
263
264 if self.__class__.headname in items:
265 items.remove(self.__class__.headname)
266
267 for fname in items:
268 fval = self[fname]
269
270 if not isinstance(fval, FeatureStructure):
271
272 lines.append('%s = %r' % (fname.ljust(maxfnamelen), fval))
273
274 else:
275
276
277 if lines and lines[-1] != '': lines.append('')
278
279
280 fval_lines = fval._str(reentrances, reentrance_ids)
281
282
283 fval_lines = [(' '*(maxfnamelen+3))+l for l in fval_lines]
284
285
286 nameline = (len(fval_lines)-1)/2
287
288 fval_lines[nameline] = (
289 fname.ljust(maxfnamelen)+' ='+
290 fval_lines[nameline][maxfnamelen+2:])
291
292
293 lines += fval_lines
294
295
296 lines.append('')
297
298
299 if lines[-1] == '': lines = lines[:-1]
300
301
302 headline = (len(lines) - 1)/2
303 if self.has_feature(self.__class__.headname):
304 head = self[self.__class__.headname]
305 else:
306 head = ''
307 maxlen = max([len(line) for line in lines])
308 for l in range(len(lines)):
309 line = lines[l]
310 if l == headline:
311 lines[l] = ('%s[ %s%s ]' % (head, line, ' '*(maxlen-len(line))))
312 else:
313 lines[l] = ('%s[ %s%s ]' % (' '*len(head), line, ' '*(maxlen-len(line))))
314
315 return lines
316
317
318
319
320
321
322
323 _PARSE_RE = {'categorystart': re.compile(r'\s*([^\s\(\)"\'\-=,\[\]]*)\s*\['),
324 'bool': re.compile(r'\s*([-\+])'),
325 'arrow': re.compile(r'\s*->\s*'),
326
327 'disjunct': re.compile(r'\s*\|\s*'),
328 'whitespace': re.compile(r'\s*')}
329 for (k, v) in FeatureStructure._PARSE_RE.iteritems():
330 assert k not in _PARSE_RE
331 _PARSE_RE[k] = v
332
333
334 - def _parse(cls, s, position=0, reentrances=None):
335 """
336 Helper function that parses a Category.
337 @param s: The string to parse.
338 @param position: The position in the string to start parsing.
339 @param reentrances: A dictionary from reentrance ids to values.
340 @return: A tuple (val, pos) of the feature structure created
341 by parsing and the position where the parsed feature
342 structure ends.
343 """
344
345 _PARSE_RE = cls._PARSE_RE
346
347
348 match = _PARSE_RE['name'].match(s, position)
349 if match is not None:
350 head = match.group(1)
351 position = match.end()
352 else: head = None
353
354
355 if position >= len(s) or s[position] != '[':
356 return cls(**{cls.headname: head}), position
357 position += 1
358
359
360
361 match = _PARSE_RE['bracket'].match(s, position)
362 if match is not None:
363 if head is None: return cls(), match.end()
364 else: return cls(**{cls.headname: head}), match.end()
365
366
367
368
369
370
371 features = {}
372 if head is not None: features[cls.headname] = head
373 while position < len(s):
374
375 name = target = val = None
376
377
378 match = _PARSE_RE['bool'].match(s, position)
379 if match is not None:
380 if match.group(1) == '+': val = True
381 else: val = False
382 position = match.end()
383
384
385 match = _PARSE_RE['name'].match(s, position)
386 if match is None: raise ValueError('feature name', position)
387 name = match.group(1)
388 position = match.end()
389
390
391 if val is None:
392 match = _PARSE_RE['assign'].match(s, position)
393 if match is None: raise ValueError('equals sign', position)
394 position = match.end()
395 val, position = cls._parseval(s, position, reentrances)
396 features[name] = val
397
398
399 match = _PARSE_RE['bracket'].match(s, position)
400 if match is not None:
401 return cls(**features), match.end()
402
403
404 match = _PARSE_RE['comma'].match(s, position)
405 if match is None: raise ValueError('comma', position)
406 position = match.end()
407
408
409 raise ValueError('close bracket', position)
410
411
412 - def _parseval(cls, s, position, reentrances):
413 """
414 Helper function that parses a feature value. Currently
415 supports: None, bools, integers, variables, strings, nested feature
416 structures.
417 @param s: The string to parse.
418 @param position: The position in the string to start parsing.
419 @param reentrances: A dictionary from reentrance ids to values.
420 @return: A tuple (val, pos) of the value created by parsing
421 and the position where the parsed value ends.
422 """
423
424 _PARSE_RE = cls._PARSE_RE
425
426
427 if position == len(s): raise ValueError('value', position)
428
429
430 match = _PARSE_RE['application'].match(s, position)
431 if match is not None:
432 fun = ParserSubstitute(match.group(2)).next()
433 arg = ParserSubstitute(match.group(3)).next()
434 return ApplicationExpressionSubst(fun, arg), match.end()
435
436
437 match = _PARSE_RE['semantics'].match(s, position)
438 if match is not None:
439 return ParserSubstitute(match.group(1)).next(), match.end()
440
441
442 if s[position] in "'\"":
443 start = position
444 quotemark = s[position:position+1]
445 position += 1
446 while 1:
447 match = _PARSE_RE['stringmarker'].search(s, position)
448 if not match: raise ValueError('close quote', position)
449 position = match.end()
450 if match.group() == '\\': position += 1
451 elif match.group() == quotemark:
452 return eval(s[start:position]), position
453
454
455 if _PARSE_RE['categorystart'].match(s, position) is not None:
456 return cls._parse(s, position, reentrances)
457
458
459 match = _PARSE_RE['var'].match(s, position)
460 if match is not None:
461 return FeatureVariable.parse(match.group()), match.end()
462
463
464 match = _PARSE_RE['none'].match(s, position)
465 if match is not None:
466 return None, match.end()
467
468
469 match = _PARSE_RE['int'].match(s, position)
470 if match is not None:
471 return int(match.group()), match.end()
472
473
474 match = _PARSE_RE['symbol'].match(s, position)
475 if match is not None:
476 return cls(**{cls.headname: match.group()}), match.end()
477
478
479 raise ValueError('value', position)
480
481
482
484 """
485 Parse a L{CFG} line involving C{Categories}. A line has this form:
486
487 C{lhs -> rhs | rhs | ...}
488
489 where C{lhs} is a Category, and each C{rhs} is a sequence of
490 Categories.
491
492 @returns: a list of C{Productions}, one for each C{rhs}.
493 """
494 _PARSE_RE = cls._PARSE_RE
495 position = 0
496 try:
497 lhs, position = cls._parse(s, position)
498 except ValueError, e:
499 estr = ('Error parsing field structure\n\n\t' +
500 s + '\n\t' + ' '*e.args[1] + '^ ' +
501 'Expected %s\n' % e.args[0])
502 raise ValueError, estr
503 lhs.freeze()
504
505 match = _PARSE_RE['arrow'].match(s, position)
506 if match is None: raise ValueError('arrow', position)
507 else: position = match.end()
508 rules = []
509 while position < len(s):
510 rhs = []
511 while position < len(s) and _PARSE_RE['disjunct'].match(s, position) is None:
512 try:
513 val, position = cls._parseval(s, position, {})
514 except ValueError, e:
515 estr = ('Error parsing field structure\n\n\t' +
516 s + '\n\t' + ' '*e.args[1] + '^ ' +
517 'Expected %s\n' % e.args[0])
518 raise ValueError, estr
519 if isinstance(val, Category): val.freeze()
520 rhs.append(val)
521 position = _PARSE_RE['whitespace'].match(s, position).end()
522 rules.append(Production(lhs, rhs))
523
524 if position < len(s):
525 match = _PARSE_RE['disjunct'].match(s, position)
526 position = match.end()
527
528
529
530 if len(rules) == 0: rules = [Production(lhs, ())]
531 return rules
532
533 _parseval=classmethod(_parseval)
534 _parse=classmethod(_parse)
535 parse_rules=classmethod(parse_rules)
536
537
539 """
540 A class of C{Category} for use in parsing.
541
542 The name of the head feature in a C{GrammarCategory} is C{pos} (for "part
543 of speech"). There is one required feature, C{/}, which is intended to
544 indicate a type of phrase that is missing from the grammatical structure.
545
546 In addition, GrammarCategories are displayed and parse differently, to be
547 consistent with NLP teaching materials: the value of the C{/} feature can
548 be written with a slash after the right bracket, so that the string
549 representation looks like: C{head[...]/value}.
550
551 An example of a C{GrammarCategory} is C{VP[+fin]/NP}, for a verb phrase
552 that is finite and has an omitted noun phrase inside it.
553 """
554
555 headname = 'pos'
556 requiredFeatures = ['/']
557
558 - def _repr(self, reentrances, reentrance_ids):
559 segments = []
560
561 items = self.feature_names()
562 items.sort()
563
564 for fname in items:
565 if fname == self.__class__.headname or fname == '/': continue
566 fval = self[fname]
567 if isinstance(fval, bool):
568 if fval: segments.append('+%s' % fname)
569 else: segments.append('-%s' % fname)
570 elif isinstance(fval, logic.Expression):
571 segments.append('%s=%r' % (fname, fval.__str__()))
572 elif not isinstance(fval, Category):
573 segments.append('%s=%r' % (fname, fval))
574 else:
575 fval_repr = fval._repr(reentrances, reentrance_ids)
576 segments.append('%s=%s' % (fname, fval_repr))
577
578 head = self._features.get(self.__class__.headname)
579 if head is None: head = ''
580 if not len(segments): features = ''
581 else: features = "[%s]" % ', '.join(segments)
582 slash = self._features.get('/')
583 if slash is None: slash = ''
584 else: slash = '/%r' % slash
585
586 return '%s%s%s' % (head, features, slash)
587
588
589
590
591 _PARSE_RE = {'semantics': re.compile(r'<([^>]+)>'),
592 'application': re.compile(r'<(app)\((\?[a-z][a-z]*)\s*,\s*(\?[a-z][a-z]*)\)>'),
593 'slash': re.compile(r'\s*/\s*')}
594 for (k, v) in Category._PARSE_RE.iteritems():
595 assert k not in _PARSE_RE
596 _PARSE_RE[k] = v
597
598 _PARSE_RE['name'] = re.compile(r'\s*([^\s\(\)"\'\-=,\[\]/]+)\s*')
599 _PARSE_RE['categorystart'] = re.compile(r'\s*([^\s\(\)"\'\-=,\[\]/]*)\s*(\[|/)')
600
601
602 - def _parse(cls, s, position=0, reentrances=None):
603
604 _PARSE_RE = cls._PARSE_RE
605
606 features = {}
607
608
609 match = _PARSE_RE['name'].match(s, position)
610 if match is not None:
611 features[cls.headname] = match.group(1)
612 position = match.end()
613
614
615
616 if position < len(s) and s[position] == '[':
617 position += 1
618
619
620
621
622
623
624 while True:
625 if not position < len(s):
626 raise ValueError('close bracket', position)
627
628
629 name = target = val = None
630
631
632 match = _PARSE_RE['bracket'].match(s, position)
633 if match is not None:
634 position = match.end()
635
636 break
637
638
639 match = _PARSE_RE['bool'].match(s, position)
640 if match is not None:
641 if match.group(1) == '+': val = True
642 else: val = False
643 position = match.end()
644
645
646 match = _PARSE_RE['name'].match(s, position)
647 if match is None: raise ValueError('feature name', position)
648 name = match.group(1)
649 position = match.end()
650
651
652 if val is None:
653 match = _PARSE_RE['assign'].match(s, position)
654 if match is None: raise ValueError('equals sign', position)
655 position = match.end()
656
657 val, position = cls._parseval(s, position, reentrances)
658 features[name] = val
659
660
661 match = _PARSE_RE['bracket'].match(s, position)
662 if match is not None:
663 position = match.end()
664
665 break
666
667
668 match = _PARSE_RE['comma'].match(s, position)
669 if match is None: raise ValueError('comma', position)
670 position = match.end()
671
672
673 match = _PARSE_RE['slash'].match(s, position)
674 if match is not None:
675 position = match.end()
676 slash, position = cls._parseval(s, position, 0)
677 features['/'] = slash
678
679 return cls(**features), position
680
681 _parse = classmethod(_parse)
682
684 """
685 A lambda calculus expression parser, extended to create application
686 expressions which support the SubstituteBindingsI interface.
687 """
690
692 """
693 A lambda application expression, extended to implement the
694 SubstituteBindingsI interface.
695 """
706
707
708
709
710
768
769
782
783 if __name__ == '__main__':
784 demo()
785