Package nltk_lite :: Package tokenize :: Module simple
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.tokenize.simple

 1  # Natural Language Toolkit: Simple Tokenizers 
 2  # 
 3  # Copyright (C) 2001-2007 University of Pennsylvania 
 4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
 5  #         Steven Bird <sb@csse.unimelb.edu.au> 
 6  #         Trevor Cohn <tacohn@csse.unimelb.edu.au> 
 7  # URL: <http://nltk.sourceforge.net> 
 8  # For license information, see LICENSE.TXT 
 9   
10  """ 
11  Functions for tokenizing a text, based on a regular expression 
12  which matches tokens or gaps. 
13  """ 
14   
# Separator strings used by the tokenizer functions in this module.
SPACE = " "                 # a single space character
NEWLINE = "\n"              # a single line break
BLANKLINE = NEWLINE * 2     # two consecutive line breaks, i.e. an empty line
# A backslash at the start of a line, written in regular-expression syntax
# ('^' anchor followed by an escaped backslash).
SHOEBOXSEP = r"^\\"
19   
def space(s):
    """
    Split the text at every single space character.

    Runs of spaces are not merged: each extra space yields an empty-string
    token, exactly as C{str.split} does when given an explicit separator.

    @param s: the text to be tokenized
    @type s: C{string}
    @return: the pieces of C{s} found between occurrences of C{SPACE}
    @rtype: C{list} of C{string}
    """
    tokens = s.split(SPACE)
    return tokens
29
def line(s):
    """
    Split the text into lines at every newline character.

    A trailing newline produces a final empty-string token, as with any
    C{str.split} call that is given an explicit separator.

    @param s: the text to be tokenized
    @type s: C{string}
    @return: the pieces of C{s} found between occurrences of C{NEWLINE}
    @rtype: C{list} of C{string}
    """
    lines = s.split(NEWLINE)
    return lines
39
def blankline(s):
    """
    Split the text into paragraphs, using a blank line as the separator.

    The separator is exactly two consecutive newline characters
    (C{BLANKLINE}); single newlines inside a paragraph are kept.

    @param s: the text to be tokenized
    @type s: C{string}
    @return: the pieces of C{s} found between occurrences of C{BLANKLINE}
    @rtype: C{list} of C{string}
    """
    paragraphs = s.split(BLANKLINE)
    return paragraphs
49
def shoebox(s):
    """
    Tokenize a Shoebox entry into its fields (separated by backslash markers).

    C{SHOEBOXSEP} is a regular expression (a backslash at the start of a
    line), so it is applied with the C{re} module in multi-line mode.
    Passing it to C{str.split} would only look for the literal text of the
    pattern, which does not split an entry at its field markers.

    The backslash that introduces each field is consumed by the split.
    The first token is whatever precedes the first marker, which is the
    empty string when the entry starts with a marker.  Line breaks inside
    and at the end of a field are kept.

    @param s: the Shoebox entry to be tokenized
    @type s: C{string}
    @return: the fields of the entry, without their leading backslash
    @rtype: C{list} of C{string}
    """
    # Imported locally so the function does not rely on a module-level import.
    import re

    # MULTILINE makes '^' match at the start of every line, not only at the
    # start of the whole string.
    field_marker = re.compile(SHOEBOXSEP, re.MULTILINE)
    return field_marker.split(s)

##//////////////////////////////////////////////////////
##  Demonstration
##//////////////////////////////////////////////////////

def demo():
    """
    Print a sample string, then the tokens produced for it by the
    C{space} and C{line} tokenizers.
    """
    # Sample text containing spaces, single newlines and one blank line.
    s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."

    print('Input text:')
    print(repr(s))
    print('')

    # Each entry pairs a heading with the tokenizer it describes.
    demonstrations = (
        ('Tokenize using individual space characters:', space),
        ('Tokenize by lines:', line),
    )
    for heading, tokenizer in demonstrations:
        print(heading)
        print(list(tokenizer(s)))
        print('')
# Run the demonstration when this module is executed as a script.
if __name__ == '__main__':
    demo()