Home | Trees | Indices | Help |
|
---|
|
1 """ 2 Utilities for converting chunked treebank into format that can be 3 input to Nivre's MaltParser. 4 """ 5 from nltk_lite.corpora import get_basedir 6 from nltk_lite import tokenize 7 from itertools import islice 8 import os 911 loc = s.rfind(sep) 12 if loc >= 0: 13 word = s[:loc] 14 tag = s[loc+1:] 15 tag = tag.replace('(', '-LRB-').replace(')', '-RRB-') 16 return "%s\t%s\n" % (word, tag) 17 else: 18 return (s, None)1921 """ 22 @param files: One or more treebank files to be processed 23 @type files: L{string} or L{tuple(string)} 24 @return: iterator over lines in Malt-TAB input format 25 """ 26 if type(files) is str: files = (files,) 27 28 if not basedir: basedir = get_basedir() 29 30 for file in files: 31 path = os.path.join(get_basedir(), "treebank", file) 32 f = open(path).read() 33 34 for sent in tokenize.blankline(f): 35 l = [] 36 for t in tokenize.whitespace(sent): 37 if (t != '[' and t != ']'): 38 l.append(tag2tab(t)) 39 #add a blank line as sentence separator 40 l.append('\n') 41 yield l42 50 #s += ''.join(sent) 51 #print >>f, s 52 #f.close() 53 54 55 56 if __name__ == '__main__': 57 demo() 58
Home | Trees | Indices | Help |
|
---|
Generated by Epydoc 3.0beta1 on Wed May 16 22:47:42 2007 | http://epydoc.sourceforge.net |