
Source Code for Module nltk_lite.wordnet.brown_ic

import pickle
import sys

from itertools import islice

from nltk_lite.corpora import brown
from nltk_lite.probability import *
from nltk_lite.tokenize import *
from nltk_lite.wordnet import *

def substr_binary_search(item, list):

    if not list:
        return None

    low = 0
    high = len(list) - 1
    mid = high / 2

    while list[mid].find(item) < 0:

        if mid >= high or mid <= low: return False

        elif list[mid] > item:
            high = mid
            mid -= (high - low) / 2

        elif list[mid] < item:
            low = mid
            mid += (high - low) / 2

    return list[mid].split(':')[1]
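
# Illustrative usage (not part of the original module): the list passed in is
# expected to be sorted and to hold 'compound:type' entries of the kind
# written by generate_compound_list() below; the return value is the type tag
# of a matching entry, or False when no match is found.
#
#     >>> compounds = ['egg cup:nc', 'jigsaw puzzle:nc', 'run over:vbc']
#     >>> substr_binary_search('jigsaw puzzle', compounds)
#     'nc'
#     >>> substr_binary_search('zebra crossing', compounds)
#     False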

def generate_compound_list(filename=None):

    dictionaries = [N, V]
    compound_types = ['nc', 'vbc']

    if filename: outfile = open(filename, 'w')
    else: outfile = sys.stdout

    for dict, type in zip(dictionaries, compound_types):

        for term in dict:
            term = term.form
            if ' ' in term: outfile.write("%s:%s\n" % (term, type))
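
# Example of the output format (illustrative entries): each multi-word noun or
# verb dictionary entry is written on its own line as 'term:type', which is
# exactly what read_word_list() and substr_binary_search() above operate on.
#
#     egg cup:nc
#     jigsaw puzzle:nc
#     band together:vbc
#     run over:vbc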

def read_word_list(filename):

    word_list = []
    infile = open(filename, 'r')

    for line in infile:
        line = line.rstrip()
        word_list.append(line)

    word_list.sort()

    return word_list

def get_roots(dictionary):
    roots = []
    for word in dictionary:
        for sense in word:
            synset = sense.synset
            hypernyms = set(synset[HYPERNYM]) | set(synset[INSTANCE_HYPERNYM])
            if len(hypernyms) == 0: roots.append(synset)
    return roots
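
# Illustrative usage (assumes the WordNet dictionaries N and V imported above
# are loaded): a root is any synset with no hypernyms of either kind. For the
# noun taxonomy this is essentially just the 'entity' synset, which is also
# used as the normalising root further below, while the verb taxonomy has
# many separate roots.
#
#     >>> noun_roots = get_roots(N)
#     >>> verb_roots = get_roots(V)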

def propogate_frequencies(freq_dist, synset):

    hyponyms = set(synset[HYPONYM]) | set(synset[INSTANCE_HYPONYM])
    for hyponym in hyponyms:
        freq_dist.inc(synset, propogate_frequencies(freq_dist, hyponym))
    return freq_dist.count(synset)
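
# Worked sketch (hypothetical counts, not part of the original module): after
# propagation each synset's count is its own count plus the propagated counts
# of all its hyponyms. Given a chain of synsets with initial counts
#
#     entity (0)  ->  organism (2)  ->  person (5)
#
# calling propogate_frequencies(freq_dist, entity_synset) leaves person at 5,
# organism at 2 + 5 = 7 and entity at 0 + 7 = 7, so rarer (more specific)
# synsets keep lower counts and therefore higher information content.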

def brown_information_content(output_filename, compounds_filename, \
    stopwords_filename=None, smoothing=True):

    # A list of all the noun and verb parts of speech as recorded in the
    # Brown corpus. Currently many are ignored because the Wordnet morphy()
    # method, which derives the base form of inflected words, can't handle
    # contractions, genitive forms etc.

    noun_tags = [
        'nn',           # N. sing. common (burden)
        # 'nn$',        # N. sing. common, genitive (company's)
        # 'nn+bez',     # N. sing. common + 'to be' pres. 3rd pers sing. (wife's)
        # 'nn+hvd',     # N. sing. common + 'to have' past (James'd)
        # 'nn+hvz',     # N. sing. common + 'to have' pres. 3rd pers sing. (guy's)
        # 'nn+in',      # N. sing. common + prep. (buncha)
        # 'nn+md',      # N. sing. common + modal aux. (sun'll)
        # 'nn+nn',      # N. sing. common, hyphenated pair (stomach-belly)
        'nns',          # N. plu. common (burdens)
        # 'nns$',       # N. plu. common genitive (companies')
        # 'nns+md',     # N. plu. common + modal aux. (cowboys'll)
        'np',           # N. sing. proper (September)
        # 'np$',        # N. sing. proper genitive (William's)
        # 'np+bez',     # N. sing. proper + 'to be' pres. 3rd pers sing. (Rob's)
        # 'np+hvd',     # N. sing. proper + 'to have' past (James'd)
        # 'np+hvz',     # N. sing. proper + 'to have' pres. 3rd pers sing. (Bill's)
        # 'np+md',      # N. sing. proper + modal aux. (John'll)
        'nps',          # N. plu. proper (Catholics)
        # 'nps$',       # N. plu. proper, genitive (Republicans')
        'nr',           # N. sing. adverbial (today, Saturday, East)
        # 'nr$',        # N. sing. adverbial, genitive (Saturday's)
        # 'nr+md'       # N. sing. adverbial + modal aux. (today'll)
        'nrs',          # N. plu. adverbial (Sundays)
        'nc',           # N. compound (jigsaw puzzle, egg cup)
    ]

    verb_tags = [
        'vb',           # V. base: pres. imp. or inf. (find, act)
        # 'vb+at',      # V. base: pres. or inf. + article (wanna)
        # 'vb+in',      # V. base: pres. imp. or inf. + prep. (lookit)
        # 'vb+jj',      # V. base: pres. imp. or inf. + adj. (die-dead)
        # 'vb+ppo',     # V. pres. + pronoun, personal, acc. (let's)
        # 'vb+rp',      # V. imperative + adverbial particle (g'ahn, c'mon)
        # 'vb+vb',      # V. base: pres. imp. or inf. hyphenated pair (say-speak)
        'vbd',          # V. past (said, produced)
        'vbg',          # V. pres. part./gerund (keeping, attending)
        # 'vbg+to',     # V. pres. part. + infinitival to (gonna)
        'vbn',          # V. past part. (conducted, adopted)
        # 'vbn+to',     # V. past part. + infinitival to (gotta)
        'vbz',          # V. pres. 3rd pers. sing. (deserves, seeks)
        'vbc'           # V. compound (band together, run over)
    ]

    outfile = open(output_filename, "wb")

    if compounds_filename:
        compounds = read_word_list(compounds_filename)
    else:
        compounds = []

    if stopwords_filename:
        stopwords = read_word_list(stopwords_filename)
    else:
        stopwords = []

    noun_fd = FreqDist()
    verb_fd = FreqDist()

    count = 0
    increment = 10000

    sys.stdout.write("Building initial frequency distributions")

    for sentence in brown.tagged():

        if len(sentence) == 0:
            continue

        # Greedily search for compound nouns/verbs. The search is naive and
        # doesn't account for inflected words within the compound (so
        # variant forms of the compound will not be identified e.g. the
        # compound 'abdominal cavities' will not be recognised as the plural of
        # 'abdominal cavity'); this is in keeping with the original Perl
        # implementation. Rectifying this is mildly tricky in that some compound
        # constituents are expected to be inflected e.g. 'abandoned ship' so
        # it isn't possible to simply uninflect each constituent before
        # searching; rather, a list of variant compounds containing all possible
        # inflected/uninflected constituents would be needed (compounds rarely
        # exceed length four so the quadratic search space wouldn't be too scary).

        new_sentence = []
        compound = sentence.pop(0)

        # Pop (word token, PoS tag) tuples from the sentence until all words
        # have been consumed. Glue the word tokens together while they form
        # a substring of a valid compound. When adding a new token makes the
        # compound invalid, append the current compound onto the new sentence
        # queue and assign the new (token, tag) tuple as the current compound
        # base.

        while len(sentence) > 0:

            (token, tag) = sentence.pop(0)

            # Convert all tokens to lowercase
            token = token.lower()

            # Add this token to the current compound string, creating a
            # candidate compound token that may or may not exist in Wordnet.
            compound_token = compound[0] + ' ' + token

            # Perform a binary search through the list of all compounds. The
            # search necessarily accepts partial matches. The search returns
            # the compound type ('nc' for noun compound or 'vbc' for verb
            # compound) of the matched compound, or False if no match was
            # found. Recording the compound type is necessary so that the
            # correct frequency distribution can be updated later.

            compound_tag = substr_binary_search(compound_token, compounds)

            if compound_tag:
                compound = (compound_token, compound_tag)

            # If appending the new token to the current compound results in
            # an invalid compound, append the current compound to the new
            # sentence queue and reset it, placing the new token as the
            # beginning of a (possible) new compound.

            else:
                new_sentence.append(compound)
                compound = (token, tag)

        # The final (possibly compound) token in each sentence needs to be
        # manually appended onto the new sentence list.

        new_sentence.append(compound)

        for (token, tag) in new_sentence:

            # Give the user some feedback to let him or her know the
            # distributions are still being built. The initial stage can take
            # quite some time (half an hour or more).

            count += 1

            if count % increment == 0:
                sys.stdout.write('.')

            # Basic splitting based on the word token's POS. Later this could
            # be further developed using the additional (now commented out)
            # tag types and adding conditional checks to turn e.g. "you'll"
            # into "you" + "will". This would increase the accuracy of the
            # distribution, as currently all such contractions are discarded
            # (because they won't match any entries in the dictionary).

            if tag in noun_tags:
                pos = "noun"
                dictionary = N
                freq_dist = noun_fd

            elif tag in verb_tags:
                pos = "verb"
                dictionary = V
                freq_dist = verb_fd

            else: token = None

            # If the word form is inflected e.g. plural, retrieve its base
            # or uninflected form.

            if token is not None:

                if dictionary.has_key(token): uninflected_token = token
                else: uninflected_token = morphy(token, pos)

            else: uninflected_token = None

            # Increment the count for each sense of the word token, there
            # being no practical way to distinguish between word senses in the
            # Brown corpus (SemCor would be another story).
            if uninflected_token:
                for synset in dictionary[uninflected_token]:
                    freq_dist.inc(synset)

    # If smoothing is True perform Laplacian smoothing i.e. add 1 to each
    # synset's frequency count.

    if smoothing:

        for sample in noun_fd.samples():
            noun_fd.inc(sample)

        for sample in verb_fd.samples():
            verb_fd.inc(sample)

    # Propagate the frequency counts up the taxonomy structure. Thus the
    # root node (or nodes) will have a frequency equal to the sum of all
    # of their descendant node frequencies (plus a bit extra, if the root
    # node appeared in the source text). The distribution will then adhere
    # to the IR principle that nodes (synsets) that appear less frequently
    # have a higher information content.

    sys.stdout.write(" done.\n")
    sys.stdout.write("Finding taxonomy roots...")

    noun_roots = get_roots(N)
    verb_roots = get_roots(V)

    sys.stdout.write(" done.\n")
    sys.stdout.write("Propagating frequencies up the taxonomy trees...")

    for root in noun_roots:
        propogate_frequencies(noun_fd, root)

    for root in verb_roots:
        propogate_frequencies(verb_fd, root)

    sys.stdout.write(" done.\n")

    # Output the frequency distributions to a file. Rather than pickle the
    # frequency distributions, and the synsets contained therein, output
    # a dict of synset identifiers and probabilities. This results in a file
    # which is a great deal smaller than the pickled FreqDist object file.

    sys.stdout.write("Converting to synset/sample count dictionaries...")

    noun_dict = {}
    verb_dict = {}

    # The probabilities are not calculated as is normal for a frequency
    # distribution (i.e. sample count / sum of all sample counts). Instead
    # they are (sample count / sample count of root node), because of the
    # propagation step that was performed earlier; this has the desirable
    # consequence that root nodes have a probability of 1 and an
    # Information Content (IC) score of 0.

    root = N['entity'][0].synset

    for sample in noun_fd.samples():
        noun_dict[sample.offset] = (noun_fd.count(sample), noun_fd.count(root))

    for sample in verb_fd.samples():
        root = sample.hypernym_paths()[0][0]
        verb_dict[sample.offset] = (verb_fd.count(sample), verb_fd.count(root))

    sys.stdout.write(" done.\n")
    sys.stdout.write("Writing probability hashes to file %s..." % (output_filename))

    pickle.dump(noun_dict, outfile)
    pickle.dump(verb_dict, outfile)

    sys.stdout.write(" done.\n")

    outfile.close()
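
# Minimal sketch (not part of the original module) of how the pickled output
# might be read back and turned into Information Content scores. It assumes
# only what is written above: two dicts, pickled one after the other, mapping
# synset offsets to (synset count, root count) pairs.
#
#     import math, pickle
#
#     infile = open('brown_ic.dat', 'rb')
#     noun_counts = pickle.load(infile)
#     verb_counts = pickle.load(infile)
#     infile.close()
#
#     count, root_count = noun_counts[offset]    # 'offset' is a hypothetical synset offset
#     ic = -math.log(float(count) / root_count)  # root synsets score 0.0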

brown_information_content('brown_ic.dat', None)