Package nltk_lite :: Package tokenize :: Module sexpr'
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.tokenize.sexpr'

 1  # Natural Language Toolkit: Tokenizers 
 2  # 
 3  # Copyright (C) 2001-2007 University of Pennsylvania 
 4  # Author: Yoav Goldberg <yoavg@cs.bgu.ac.il> 
 5  #         Steven Bird <sb@csse.unimelb.edu.au> (minor edits) 
 6  # URL: <http://nltk.sourceforge.net> 
 7  # For license information, see LICENSE.TXT 
 8   
 9  import re 
10   
def sexpr(s):
    """
    Tokenize the text into s-expressions.  For example, the input
    "(a b (c d)) e (f)" is tokenized into the following sequence:
    "(a b (c d))", "e", "(f)".

    A token is either a balanced parenthesised expression or a run of
    non-whitespace characters that does not start with "(".  If an
    opening parenthesis is never closed, the whole remainder of the
    text is returned as a single final token.  Empty and
    whitespace-only input produce no tokens.

    @param s: the string to be tokenized (the implementation strips and
        indexes C{s}, so it must be a string, not an arbitrary iterator)
    @type s: C{string}
    @return: An iterator over tokens (each of which is an s-expression)
    """
    def matching_paren(s, start=0):
        # Return the index in s of the ")" that closes the "(" found at
        # s[start], or -1 when the parentheses are unbalanced.
        depth = 0
        for i in range(start, len(s)):
            c = s[i]
            if c == '(':
                depth += 1
            elif c == ')':
                depth -= 1
                if depth == 0:
                    return i
        return -1

    while s:
        s = s.strip()
        if not s:
            # Only whitespace was left: nothing more to tokenize
            # (previously this case raised IndexError on s[0]).
            break
        if s[0] == '(':
            matching_paren_pos = matching_paren(s)
            if matching_paren_pos == -1:
                # Unbalanced expression: emit everything that remains.
                yield s
                s = ''
            else:
                yield s[0:matching_paren_pos+1]
                s = s[matching_paren_pos+1:]
        else:
            # A bare word extends up to the next whitespace character or
            # the end of the text; "$" guarantees the search always matches.
            space_pos = re.search(r"\s|$", s).start()
            yield s[0:space_pos]
            s = s[space_pos:]
def demo():
    """
    Print a sample string and then each s-expression token found in it.

    The sample mixes bare words, nested expressions and a final
    expression whose parenthesis is never closed.
    """
    # Imported locally, as before; presumably the nltk_lite.tokenize
    # package re-exports sexpr -- confirm against the package __init__.
    from nltk_lite import tokenize

    example = "a b d (d e (f)) r (t i) (iu a"
    # Single-argument print(...) produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    print('Input text:')
    print(example)
    print('')
    print('Tokenize s-expressions:')
    for x in tokenize.sexpr(example):
        print(x)

# Run the demonstration only when this file is executed as a script.
if __name__ == "__main__":
    demo()