1
2
3
4
5
6
7
8
9 from nltk_lite.chunk import *
10 from nltk_lite import tokenize
11 from nltk_lite.parse.tree import Tree
12 import re, string
13
57
58
59
# One CoNLL line: "<word> <tag> <I|O|B>[-<chunk type>]".
# Groups: word, tag, IOB state, chunk type (None when no type follows).
# Raw string so that \S and \s reach the regex engine untouched.
_LINE_RE = re.compile(r'(\S+)\s+(\S+)\s+([IOB])-?(\S+)?')
def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), top_node="S"):
    """
    Convert a CoNLL IOB string into a tree.  Uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    Each line is matched against C{_LINE_RE} (word, tag, IOB tag).  Words
    whose chunk type is not listed in C{chunk_types} are treated as being
    outside any chunk and are attached directly to the root as
    C{(word, tag)} tuples.

    @param s: The CoNLL string to be converted.
    @type s: C{string}
    @param chunk_types: The chunk types to be converted.  If C{None},
        no filtering by chunk type is applied.
    @type chunk_types: C{tuple}
    @param top_node: The node label to use for the root.
    @type top_node: C{string}
    @return: A chunk structure for a single sentence
        encoded in the given CONLL 2000 style string.
    @rtype: L{Tree}
    @raise ValueError: If a line does not match C{_LINE_RE}.  The number
        in the message is the index (counting from 0) of the offending
        item as yielded by C{tokenize.line}.
    """
    # stack[0] is the root; stack[1], when present, is the chunk being built.
    stack = [Tree(top_node, [])]

    for lineno, line in enumerate(tokenize.line(s)):
        # Decode the line.
        match = _LINE_RE.match(line)
        if match is None:
            raise ValueError('Error on line %d' % lineno)
        (word, tag, state, chunk_type) = match.groups()

        # If it's a chunk type we don't care about, treat it as O.
        if chunk_types is not None and chunk_type not in chunk_types:
            state = 'O'

        # An I- tag that cannot continue the current chunk is handled like
        # a B- tag.  That is the case when no chunk is open at all (checked
        # explicitly so that a chunk type equal to the root label is not
        # mistaken for a continuation) or when the open chunk has another
        # type.
        mismatch_I = state == 'I' and (len(stack) == 1 or
                                       chunk_type != stack[-1].node)

        # For B, O and mismatched I: close the currently open chunk, if any.
        if state in 'BO' or mismatch_I:
            if len(stack) == 2:
                stack.pop()

        # For B and mismatched I: open a new chunk under the root.
        if state == 'B' or mismatch_I:
            chunk = Tree(chunk_type, [])
            stack[-1].append(chunk)
            stack.append(chunk)

        # Attach the token to the open chunk, or to the root if none is open.
        stack[-1].append((word, tag))

    return stack[0]
109
133
135 """
136 Convert a tree to the CoNLL IOB string format
137
138 @param t: The tree to be converted.
139 @type t: C{Tree}
140 @return: A multiline string where each line contains a word, tag and IOB tag.
141 @rtype: C{string}
142 """
143 lines = [string.join(token) for token in tree2conlltags(t)]
144 return '\n'.join(lines)
145
146
147
# Matches one complete IEER document.  The DOCNO, DOCTYPE, DATE_TIME and
# HEADLINE sections are optional (their named groups are None when absent);
# the TEXT section is required.  re.DOTALL lets the lazy groups span lines.
_IEER_DOC_RE = re.compile(
    r'<DOC>\s*'
    r'(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?'
    r'(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?'
    r'(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?'
    r'<BODY>\s*'
    r'(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?'
    r'<TEXT>(?P<text>.*?)</TEXT>\s*'
    r'</BODY>\s*</DOC>\s*',
    re.DOTALL)

# Extracts the value of the type="..." attribute from an opening "<b_...>"
# tag.  Raw string so that \w and \s reach the regex engine untouched.
_IEER_TYPE_RE = re.compile(r'<b_\w+\s+[^>]*?type="(?P<type>\w+)"')
158
def _ieer_read_text(s, top_node):
    """
    Build a chunk structure from a string of IEER marked-up text.

    The string is split into tags (C{<...>}) and runs of non-whitespace
    text.  A tag starting with C{<b_} opens a chunk labeled with the tag's
    C{type} attribute, a tag starting with C{<e_} closes the innermost
    open chunk, and every other piece (including any other tag) is added
    as a leaf of the innermost open node.

    @param s: The marked-up text, or C{None}.
    @type s: C{string}
    @param top_node: The node label to use for the root.
    @type top_node: C{string}
    @return: The chunk structure, or an empty list if C{s} is C{None}.
    @rtype: L{Tree} or C{list}
    @raise ValueError: If an opening tag has no C{type} attribute, if a
        closing tag has no matching opening tag, or if a chunk is left
        open at the end of the string.
    """
    # An absent optional section arrives here as None (an unmatched regex
    # group); callers receive an empty list for it rather than a Tree.
    if s is None:
        return []

    # stack[0] is the root; the last element is the innermost open node.
    stack = [Tree(top_node, [])]

    for piece_m in re.finditer(r'<[^>]+>|[^\s<]+', s):
        piece = piece_m.group()
        try:
            if piece.startswith('<b_'):
                m = _IEER_TYPE_RE.match(piece)
                if m is None:
                    # Reported through the handler below, so the caller
                    # gets the character offset of the malformed tag.
                    raise ValueError('no type attribute in %r' % piece)
                chunk = Tree(m.group('type'), [])
                stack[-1].append(chunk)
                stack.append(chunk)
            elif piece.startswith('<e_'):
                stack.pop()
            else:
                stack[-1].append(piece)
        except (IndexError, ValueError):
            raise ValueError('Bad IEER string (error at character %d)' %
                             piece_m.start())

    # Anything other than exactly the root means unbalanced tags.
    if len(stack) != 1:
        raise ValueError('Bad IEER string')
    return stack[0]
187
def ieerstr2tree(s, chunk_types=('LOCATION', 'ORGANIZATION', 'PERSON',
                                 'DURATION', 'DATE', 'CARDINAL', 'PERCENT',
                                 'MONEY', 'MEASURE'), top_node="S"):
    """
    Convert a string of chunked tagged text in the IEER named
    entity format into a chunk structure.  Chunks are of several
    types, LOCATION, ORGANIZATION, PERSON, DURATION, DATE, CARDINAL,
    PERCENT, MONEY, and MEASURE.

    @param s: The IEER string to be converted.
    @type s: C{string}
    @param chunk_types: The chunk types of interest.  This argument is
        accepted for interface compatibility but is not consulted by this
        function: every chunk found in the input is kept.
    @type chunk_types: C{tuple}
    @param top_node: The node label to use for the root of each tree.
    @type top_node: C{string}
    @return: If C{s} is a complete C{<DOC>} document, a dictionary with
        the keys C{'text'} (a L{Tree}), C{'headline'} (a L{Tree}, or an
        empty list when the document has no headline), and C{'docno'},
        C{'doctype'} and C{'date_time'} (strings, or C{None} when the
        corresponding section is absent).  Otherwise, a chunk structure
        containing the chunked tagged text that is encoded in the given
        IEER style string.
    @rtype: C{dict} or L{Tree}
    @raise ValueError: If the marked-up text is malformed.
    """
    m = _IEER_DOC_RE.match(s)
    if m is None:
        # Not wrapped in <DOC>...</DOC>: treat the whole string as text.
        return _ieer_read_text(s, top_node)

    return {
        'text': _ieer_read_text(m.group('text'), top_node),
        'docno': m.group('docno'),
        'doctype': m.group('doctype'),
        'date_time': m.group('date_time'),
        'headline': _ieer_read_text(m.group('headline'), top_node),
    }
216
217
219
220 s = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."
221 from nltk_lite import chunk
222 t = chunk.tagstr2tree(s, chunk_node='NP')
223 print t.pp()
224 print
225
226 s = """
227 These DT B-NP
228 research NN I-NP
229 protocols NNS I-NP
230 offer VBP B-VP
231 to TO B-PP
232 the DT B-NP
233 patient NN I-NP
234 not RB O
235 only RB O
236 the DT B-NP
237 very RB I-NP
238 best JJS I-NP
239 therapy NN I-NP
240 which WDT B-NP
241 we PRP B-NP
242 have VBP B-VP
243 established VBN I-VP
244 today NN B-NP
245 but CC B-NP
246 also RB I-NP
247 the DT B-NP
248 hope NN I-NP
249 of IN B-PP
250 something NN B-NP
251 still RB B-ADJP
252 better JJR I-ADJP
253 . . O
254 """
255
256 conll_tree = conllstr2tree(s, chunk_types=('NP', 'PP'))
257 print conll_tree.pp()
258
259
260 print "CoNLL output:"
261 print chunk.tree2conllstr(conll_tree)
262 print
263
264
# Run the demonstration only when this module is executed as a script.
if __name__ == '__main__':
    demo()
267