Package nltk_lite :: Package chunk :: Module convert
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.chunk.convert

  1  # Natural Language Toolkit: Chunk format conversions 
  2  # 
  3  # Copyright (C) 2001-2007 University of Pennsylvania 
  4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
  5  #         Steven Bird <sb@csse.unimelb.edu.au> (minor additions) 
  6  # URL: <http://nltk.sf.net> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  from nltk_lite.chunk import * 
 10  from nltk_lite import tokenize 
 11  from nltk_lite.parse.tree import Tree 
 12  import re, string 
 13   
def tagstr2tree(s, chunk_node="NP", top_node="S"):
    """
    Divide a string of bracketed tagged text into chunks and unchunked
    tokens, and produce a C{Tree}.  Chunks are marked by square brackets
    (C{[...]}).  Words are delimited by whitespace, and each word should
    have the form C{I{text}/I{tag}}.  Words that do not contain a slash
    are assigned a C{tag} of C{None}.

    Brackets must be balanced and may not be nested.

    @return: A tree corresponding to the string representation.
    @rtype: C{tree}
    @param s: The string to be converted
    @type s: C{string}
    @param chunk_node: The label to use for chunk nodes
    @type chunk_node: C{string}
    @param top_node: The label to use for the root of the tree
    @type top_node: C{string}
    @raise ValueError: If the brackets in C{s} are nested or unbalanced.
    """

    WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+')
    pieces = WORD_OR_BRACKET.findall(s)

    # Validate the brackets with one linear scan before building anything.
    # A hand-written scan is used rather than a single validating regular
    # expression, because patterns of the form (X+|...)*$ backtrack
    # exponentially on unbalanced input.
    depth = 0
    for piece in pieces:
        if piece == '[':
            depth += 1
        elif piece == ']':
            depth -= 1
        # depth > 1 means a nested '[', depth < 0 means a stray ']'.
        if depth < 0 or depth > 1:
            raise ValueError('Invalid token string (bad brackets)')
    if depth != 0:
        # A chunk was opened but never closed.
        raise ValueError('Invalid token string (bad brackets)')

    # stack[0] is the root; stack[-1] is the node currently being filled.
    stack = [Tree(top_node, [])]
    for text in pieces:
        if text == '[':
            chunk = Tree(chunk_node, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text == ']':
            stack.pop()
        else:
            # Split on the LAST slash so that words containing slashes
            # keep them in the text part.
            slash = text.rfind('/')
            if slash >= 0:
                tok = (text[:slash], text[slash+1:])
            else:
                tok = (text, None)
            stack[-1].append(tok)

    return stack[0]
57 58 ### CONLL 59 60 _LINE_RE = re.compile('(\S+)\s+(\S+)\s+([IOB])-?(\S+)?')
def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), top_node="S"):
    """
    Convert a CoNLL IOB string into a tree.  Uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    @param s: The CoNLL string to be converted.
    @type s: C{string}
    @param chunk_types: The chunk types to be converted.  Tokens whose
        chunk type is not listed are treated as outside any chunk.  If
        C{None}, every chunk type is kept.
    @type chunk_types: C{tuple}
    @param top_node: The node label to use for the root.
    @type top_node: C{string}
    @return: A chunk structure for a single sentence
        encoded in the given CONLL 2000 style string.
    @rtype: L{Tree}
    @raise ValueError: If a line does not match the expected
        word / tag / IOB-tag layout.
    """

    # stack[0] is the root; a second entry, when present, is the chunk
    # currently being filled (chunks are never nested).
    stack = [Tree(top_node, [])]

    for lineno, line in enumerate(tokenize.line(s)):

        # Decode the line.  (lineno counts from zero.)
        match = _LINE_RE.match(line)
        if match is None:
            raise ValueError('Error on line %d' % lineno)
        (word, tag, state, chunk_type) = match.groups()

        # If it's a chunk type we don't care about, treat it as O.
        if (chunk_types is not None and
            chunk_type not in chunk_types):
            state = 'O'

        # An "Inside" token continues the open chunk only if a chunk is
        # actually open and has the same type.  Checking for an open chunk
        # first avoids comparing against the root's label, which would
        # wrongly absorb the token into the root when top_node happens to
        # equal the chunk type.
        in_chunk = len(stack) == 2
        mismatch_I = state == 'I' and (not in_chunk or
                                       chunk_type != stack[-1].node)

        # For "Begin"/"Outside", finish any completed chunks -
        # also do so for "Inside" which don't match the previous token.
        if state in 'BO' or mismatch_I:
            if in_chunk:
                stack.pop()

        # For "Begin" (or an unmatched "Inside"), start a new chunk.
        if state == 'B' or mismatch_I:
            chunk = Tree(chunk_type, [])
            stack[-1].append(chunk)
            stack.append(chunk)

        # Add the new word token.
        stack[-1].append((word, tag))

    return stack[0]
109
def tree2conlltags(t):
    """
    Flatten a chunk tree into CoNLL IOB triples.

    Children of C{t} that carry a C{node} label are treated as chunks:
    their first token is tagged C{B-<label>} and the remaining tokens
    C{I-<label>}.  Any other child is treated as a token outside every
    chunk and tagged C{O}.

    @param t: The tree to be converted.
    @type t: C{Tree}
    @return: A list of 3-tuples containing word, tag and IOB tag.
    @rtype: C{list} of C{tuple}
    @raise ValueError: If a chunk itself contains a C{Tree}.
    """

    rows = []
    for subtree in t:
        try:
            # Plain tokens have no 'node' attribute; the resulting
            # AttributeError routes them to the "O" branch below.
            label = subtree.node
            marker = "B-"
            for leaf in subtree:
                if isinstance(leaf, Tree):
                    raise ValueError("Tree is too deeply nested to be printed in CoNLL format")
                rows.append((leaf[0], leaf[1], marker + label))
                # Every token after the first continues the chunk.
                marker = "I-"
        except AttributeError:
            rows.append((subtree[0], subtree[1], "O"))
    return rows
133
def tree2conllstr(t):
    """
    Convert a tree to the CoNLL IOB string format

    @param t: The tree to be converted.
    @type t: C{Tree}
    @return: A multiline string where each line contains a word, tag and
        IOB tag, separated by single spaces.
    @rtype: C{string}
    """
    # The str.join method replaces the deprecated string.join() function;
    # both put a single space between the fields.
    lines = [' '.join(token) for token in tree2conlltags(t)]
    return '\n'.join(lines)
145 146 ### IEER 147 148 _IEER_DOC_RE = re.compile(r'<DOC>\s*' 149 r'(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?' 150 r'(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?' 151 r'(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?' 152 r'<BODY>\s*' 153 r'(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?' 154 r'<TEXT>(?P<text>.*?)</TEXT>\s*' 155 r'</BODY>\s*</DOC>\s*', re.DOTALL) 156 157 _IEER_TYPE_RE = re.compile('<b_\w+\s+[^>]*?type="(?P<type>\w+)"') 158
159 -def _ieer_read_text(s, top_node):
160 stack = [Tree(top_node, [])] 161 # s will be None if there is no headline in the text 162 # return the empty list in place of a Tree 163 if s is None: 164 return [] 165 for piece_m in re.finditer('<[^>]+>|[^\s<]+', s): 166 piece = piece_m.group() 167 try: 168 if piece.startswith('<b_'): 169 m = _IEER_TYPE_RE.match(piece) 170 if m is None: print 'XXXX', piece 171 chunk = Tree(m.group('type'), []) 172 stack[-1].append(chunk) 173 stack.append(chunk) 174 elif piece.startswith('<e_'): 175 stack.pop() 176 # elif piece.startswith('<'): 177 # print "ERROR:", piece 178 # raise ValueError # Unexpected HTML 179 else: 180 stack[-1].append(piece) 181 except (IndexError, ValueError): 182 raise ValueError('Bad IEER string (error at character %d)' % 183 piece_m.start()) 184 if len(stack) != 1: 185 raise ValueError('Bad IEER string') 186 return stack[0]
187
def ieerstr2tree(s, chunk_types = ['LOCATION', 'ORGANIZATION', 'PERSON', 'DURATION',
               'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE'], top_node="S"):
    """
    Convert a string of chunked tagged text in the IEER named
    entity format into a chunk structure.  Chunks are of several
    types, LOCATION, ORGANIZATION, PERSON, DURATION, DATE, CARDINAL,
    PERCENT, MONEY, and MEASURE.

    If C{s} is a complete IEER document, the result is a dictionary with
    the keys C{'text'}, C{'docno'}, C{'doctype'}, C{'date_time'} and
    C{'headline'}; the text and headline are converted to chunk
    structures and the other entries are the raw strings (or C{None}).
    Otherwise the whole of C{s} is converted as if it were the document
    text.

    @param s: The IEER string to be converted.
    @type s: C{string}
    @param chunk_types: Accepted for interface compatibility; this
        function does not consult it.
    @type chunk_types: C{list}
    @param top_node: The node label to use for the root.
    @type top_node: C{string}
    @return: A chunk structure containing the chunked tagged text that is
        encoded in the given IEER style string, or a dictionary of
        document fields as described above.
    @rtype: L{Tree} or C{dict}
    """

    # Try looking for a single document.  If that doesn't work, then just
    # treat everything as if it was within the <TEXT>...</TEXT>.
    doc = _IEER_DOC_RE.match(s)
    if not doc:
        return _ieer_read_text(s, top_node)

    fields = {'text': _ieer_read_text(doc.group('text'), top_node)}
    for key in ('docno', 'doctype', 'date_time'):
        fields[key] = doc.group(key)
    # The headline is parsed too, so that named entities in it are captured.
    fields['headline'] = _ieer_read_text(doc.group('headline'), top_node)
    return fields
216 217
def demo():
    """
    Demonstrate the chunk format conversions: parse a bracketed tagged
    sentence, parse a CoNLL IOB sentence keeping only NP and PP chunks,
    and print the latter back out in CoNLL format.
    """
    from nltk_lite import chunk

    bracketed = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."
    np_tree = chunk.tagstr2tree(bracketed, chunk_node='NP')
    print(np_tree.pp())
    print("")

    conll_text = """
These DT B-NP
research NN I-NP
protocols NNS I-NP
offer VBP B-VP
to TO B-PP
the DT B-NP
patient NN I-NP
not RB O
only RB O
the DT B-NP
very RB I-NP
best JJS I-NP
therapy NN I-NP
which WDT B-NP
we PRP B-NP
have VBP B-VP
established VBN I-VP
today NN B-NP
but CC B-NP
also RB I-NP
the DT B-NP
hope NN I-NP
of IN B-PP
something NN B-NP
still RB B-ADJP
better JJR I-ADJP
. . O
"""

    # VP and ADJP chunks are deliberately left out here, so their tokens
    # end up outside any chunk.
    conll_tree = conllstr2tree(conll_text, chunk_types=('NP', 'PP'))
    print(conll_tree.pp())

    # Demonstrate CoNLL output
    print("CoNLL output:")
    print(chunk.tree2conllstr(conll_tree))
    print("")


# Run the demonstration when this module is executed as a script.
if __name__ == "__main__":
    demo()