Package nltk_lite :: Package stem :: Module porter
[hide private]
[frames] | [no frames]

Source Code for Module nltk_lite.stem.porter

  1  #!/usr/bin/env python 
  2  #  
  3  # Copyright (c) 2002 Vivake Gupta (vivakeATomniscia.org).  All rights reserved. 
  4  #  
  5  # This program is free software; you can redistribute it and/or 
  6  # modify it under the terms of the GNU General Public License as 
  7  # published by the Free Software Foundation; either version 2 of the 
  8  # License, or (at your option) any later version. 
  9  # 
 10  # This program is distributed in the hope that it will be useful, 
 11  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 12  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 13  # GNU General Public License for more details. 
 14  # 
 15  # You should have received a copy of the GNU General Public License 
 16  # along with this program; if not, write to the Free Software 
 17  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 
 18  # USA 
 19  # 
 20  # This software is maintained by Vivake (vivakeATomniscia.org) and is available at: 
 21  #     http://www.omniscia.org/~vivake/python/PorterStemmer.py 
 22  # 
 23  # Additional modifications were made to incorporate this module into 
 24  # nltk.  All such modifications are marked with "--NLTK--".  The nltk 
 25  # version of this module is maintained by the NLTK development staff, 
 26  # and is available from the NLTK webpage: 
 27  #     <http://nltk.sourceforge.net> 
 28   
 29  """Porter Stemming Algorithm 
 30   
 31  This is the Porter stemming algorithm, ported to Python from the 
 32  version coded up in ANSI C by the author. It follows the algorithm 
 33  presented in 
 34   
 35  Porter, M. "An algorithm for suffix stripping." Program 14.3 (1980): 130-137. 
 36   
 37  only differing from it at the points marked --DEPARTURE-- and --NEW-- 
 38  below. 
 39   
 40  For a more faithful version of the Porter algorithm, see 
 41   
 42      http://www.tartarus.org/~martin/PorterStemmer/ 
 43   
 44  Later additions: 
 45   
 46     June 2000 
 47      
 48     The 'l' of the 'logi' -> 'log' rule is put with the stem, so that 
 49     short stems like 'geo' 'theo' etc work like 'archaeo' 'philo' etc. 
 50   
 51     This follows a suggestion of Barry Wilkins, research student at 
 52     Birmingham. 
 53   
 54   
 55     February 2000 
 56   
 57     the cvc test for not dropping final -e now looks after vc at the 
 58     beginning of a word, so are, eve, ice, ore, use keep final -e. In this 
 59     test c is any consonant, including w, x and y. This extension was 
 60     suggested by Chris Emerson. 
 61   
 62     -fully    -> -ful   treated like  -fulness -> -ful, and 
 63     -tionally -> -tion  treated like  -tional  -> -tion 
 64   
 65     both in Step 2. These were suggested by Hiranmay Ghosh, of New Delhi. 
 66   
 67     Invariants proceed, succeed, exceed. Also suggested by Hiranmay Ghosh. 
 68   
 69  Additional modifications were made to incorporate this module into 
 70  nltk.  All such modifications are marked with \"--NLTK--\".  The nltk 
 71  version of this module is maintained by the NLTK developers, and is 
 72  available from <http://nltk.sourceforge.net> 
 73  """ 
 74   
 75  ## --NLTK-- 
 76  ## Declare this module's documentation format. 
 77  __docformat__ = 'plaintext' 
 78   
 79  import sys 
 80  import re 
 81  import string 
 82   
 83  ## --NLTK-- 
 84  ## Import the nltk.stemmer module, which defines the stemmer interface 
 85  from nltk_lite.stem import * 
 86   
class Porter(StemI):
    ## --NLTK--
    ## Add a module docstring
    """
    A word stemmer based on the Porter stemming algorithm.

        Porter, M. "An algorithm for suffix stripping."
        Program 14.3 (1980): 130-137.

    A few minor modifications have been made to Porter's basic
    algorithm.  See the source code of this module for more
    information.

    The Porter Stemmer requires that all tokens have string types.
    """
def __init__(self):
    """Set up the stemmer's buffer state and the irregular-form table.

    b is a buffer holding the word being stemmed; the live letters run
    from b[k0] through b[k] (k0 is always 0 here), and k is adjusted
    downwards as suffixes are stripped.  j is a general offset into the
    buffer.  Only lower-case sequences are stemmed; forcing to lower
    case should be done before stem(...) is called.
    """
    self.b = ""     # buffer for the word being stemmed
    self.k = 0      # index of the last live character in b
    self.k0 = 0     # index of the first live character in b
    self.j = 0      # general-purpose offset into the buffer

    ## --NEW--
    ## This is a table of irregular forms. It is quite short, but still
    ## reflects the errors actually drawn to Martin Porter's attention over
    ## a 20 year period!
    ##
    ## Extend it as necessary.
    ##
    ## Each paradigm form (key) lists the surface strings (values) that
    ## map directly to it; for those strings the main stemming process
    ## is bypassed entirely.
    irregular_forms = {
        "sky"     : ["sky", "skies"],
        "die"     : ["dying"],
        "lie"     : ["lying"],
        "tie"     : ["tying"],
        "news"    : ["news"],
        "inning"  : ["innings", "inning"],
        "outing"  : ["outings", "outing"],
        "canning" : ["cannings", "canning"],
        "howe"    : ["howe"],

        # --NEW--
        "proceed" : ["proceed"],
        "exceed"  : ["exceed"],
        "succeed" : ["succeed"],    # Hiranmay Ghosh
    }

    # Invert the table: pool maps each surface form to its paradigm.
    self.pool = {}
    for paradigm, surface_forms in irregular_forms.items():
        for form in surface_forms:
            self.pool[form] = paradigm
159
def cons(self, i):
    """cons(i) is TRUE <=> b[i] is a consonant."""
    ch = self.b[i]
    if ch in 'aeiou':
        return 0
    if ch == 'y':
        # 'y' at the very start of the stem is a consonant; elsewhere
        # it is a consonant exactly when the preceding letter is not.
        if i == self.k0:
            return 1
        return (not self.cons(i - 1))
    return 1
170
def m(self):
    """m() measures the number of consonant sequences between k0 and j.
    if c is a consonant sequence and v a vowel sequence, and <..>
    indicates arbitrary presence,

       <c><v>       gives 0
       <c>vc<v>     gives 1
       <c>vcvc<v>   gives 2
       <c>vcvcvc<v> gives 3
       ....
    """
    count = 0
    i = self.k0
    # Skip any leading run of consonants; a stem with no vowel at all
    # measures 0.
    while i <= self.j and self.cons(i):
        i = i + 1
    if i > self.j:
        return count
    i = i + 1
    # Alternate vowel-run / consonant-run scans; each transition from
    # a vowel run into a consonant run adds one to the measure.
    while 1:
        # advance through the current vowel run
        while i <= self.j and not self.cons(i):
            i = i + 1
        if i > self.j:
            return count
        i = i + 1
        count = count + 1
        # advance through the current consonant run
        while i <= self.j and self.cons(i):
            i = i + 1
        if i > self.j:
            return count
        i = i + 1
207
def vowelinstem(self):
    """vowelinstem() is TRUE <=> k0,...j contains a vowel"""
    i = self.k0
    # The first non-consonant position proves the stem has a vowel.
    while i <= self.j:
        if not self.cons(i):
            return 1
        i = i + 1
    return 0
214
def doublec(self, j):
    """doublec(j) is TRUE <=> j,(j-1) contain a double consonant."""
    # There must be two characters inside the stem, and they must match;
    # a doubled letter only counts when it is a consonant.
    within = j >= self.k0 + 1
    if within and self.b[j] == self.b[j - 1]:
        return self.cons(j)
    return 0
222
def cvc(self, i):
    """cvc(i) is TRUE <=>

       a) ( --NEW--) i == 1, and p[0] p[1] is vowel consonant, or

       b) p[i - 2], p[i - 1], p[i] has the form consonant -
          vowel - consonant and also if the second c is not w, x or y.

    This is used when trying to restore an e at the end of a short
    word, e.g. cav(e), lov(e), hop(e), crim(e), but snow, box, tray.
    """
    if i == 0:
        return 0    # i == 0 never happens perhaps
    if i == 1:
        return (not self.cons(0) and self.cons(1))
    # general case: require consonant - vowel - consonant at i-2..i
    if not self.cons(i) or self.cons(i - 1) or not self.cons(i - 2):
        return 0
    # the final consonant must not be w, x or y
    if self.b[i] in 'wxy':
        return 0
    return 1
245
def ends(self, s):
    """ends(s) is TRUE <=> k0,...k ends with the string s.

    On success, j is left pointing just before the matched suffix;
    the step rules rely on this side effect.
    """
    length = len(s)
    # quick rejection on the final character (tiny speed-up)
    if self.b[self.k] != s[length - 1]:
        return 0
    if length > self.k - self.k0 + 1:
        return 0
    if s != self.b[self.k - length + 1:self.k + 1]:
        return 0
    self.j = self.k - length
    return 1
257
def setto(self, s):
    """setto(s) sets (j+1),...k to the characters in the string s,
    readjusting k."""
    n = len(s)
    head = self.b[:self.j + 1]
    tail = self.b[self.j + n + 1:]
    self.b = head + s + tail
    self.k = self.j + n
263
def r(self, s):
    """Replace the current suffix with s, but only when the remaining
    stem has a positive measure (m() > 0)."""
    if self.m() <= 0:
        return
    self.setto(s)
268
def step1ab(self):
    """step1ab() gets rid of plurals and -ed or -ing. e.g.

       caresses  ->  caress
       ponies    ->  poni
       sties     ->  sti
       tie       ->  tie       (--NEW--: see below)
       caress    ->  caress
       cats      ->  cat

       feed      ->  feed
       agreed    ->  agree
       disabled  ->  disable

       matting   ->  mat
       mating    ->  mate
       meeting   ->  meet
       milling   ->  mill
       messing   ->  mess

       meetings  ->  meet
    """
    # Plural removal: -sses -> -ss, -ies -> -i (or -ie for very short
    # words), and a bare -s is dropped (but not -ss, hence the
    # b[k-1] != 's' test).
    if self.b[self.k] == 's':
        if self.ends("sses"):
            self.k = self.k - 2
        elif self.ends("ies"):
            if self.j == 0:
                self.k = self.k - 1
            # this line extends the original algorithm, so that
            # 'flies'->'fli' but 'dies'->'die' etc
            else:
                self.k = self.k - 2
        elif self.b[self.k - 1] != 's':
            self.k = self.k - 1

    # Past/progressive endings.  Note that ends() leaves j at the start
    # of the matched suffix for the rules below.
    if self.ends("ied"):
        if self.j == 0:
            self.k = self.k - 1
        else:
            self.k = self.k - 2
        # this line extends the original algorithm, so that
        # 'spied'->'spi' but 'died'->'die' etc

    elif self.ends("eed"):
        if self.m() > 0:
            self.k = self.k - 1
    elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
        self.k = self.j
        # After stripping -ed/-ing, tidy up the exposed stem ending.
        if self.ends("at"):   self.setto("ate")
        elif self.ends("bl"): self.setto("ble")
        elif self.ends("iz"): self.setto("ize")
        elif self.doublec(self.k):
            # undouble a final double consonant, except -l, -s, -z
            self.k = self.k - 1
            ch = self.b[self.k]
            if ch == 'l' or ch == 's' or ch == 'z':
                self.k = self.k + 1
        elif (self.m() == 1 and self.cvc(self.k)):
            self.setto("e")
327
def step1c(self):
    """step1c() turns terminal y to i when there is another vowel in the stem.
    --NEW--: y -> i is only done when the y is preceded by a consonant,
    and the stem is more than that single consonant, i.e.

       (*c and not c) Y -> I

    So 'happy' -> 'happi', but 'enjoy' -> 'enjoy' etc.  Formerly
    'enjoy' -> 'enjoi' and 'enjoyment' -> 'enjoy'.  Step 1c is perhaps
    done too soon; but with this modification that no longer really
    matters.

    Also, the removal of the vowelinstem() condition means that 'spy',
    'fly', 'try' ... stem to 'spi', 'fli', 'tri' and conflate with
    'spied', 'tried', 'flies' ...
    """
    # ends() also records the suffix start in self.j, which the
    # conditions below rely on.
    if not self.ends("y"):
        return
    if self.j > 0 and self.cons(self.k - 1):
        self.b = self.b[:self.k] + 'i' + self.b[self.k + 1:]
349
def step2(self):
    """step2() maps double suffices to single ones.
    so -ization ( = -ize plus -ation) maps to -ize etc. note that the
    string before the suffix must give m() > 0.
    """
    # Dispatch on the second-to-last letter; within each group only the
    # first matching suffix is rewritten (via r(), i.e. only if m() > 0).
    if self.b[self.k - 1] == 'a':
        if self.ends("ational"):   self.r("ate")
        elif self.ends("tional"):  self.r("tion")
    elif self.b[self.k - 1] == 'c':
        if self.ends("enci"):      self.r("ence")
        elif self.ends("anci"):    self.r("ance")
    elif self.b[self.k - 1] == 'e':
        if self.ends("izer"):      self.r("ize")
    elif self.b[self.k - 1] == 'l':
        if self.ends("bli"):       self.r("ble") # --DEPARTURE--
        # To match the published algorithm, replace this phrase with
        #    if self.ends("abli"):      self.r("able")
        elif self.ends("alli"):
            if self.m() > 0:                     # --NEW--
                # -ally -> -al, then re-run step2 so e.g. -tionally
                # continues on to -tion (Hiranmay Ghosh's extension).
                self.setto("al")
                self.step2()
        elif self.ends("fulli"):   self.r("ful") # --NEW--
        elif self.ends("entli"):   self.r("ent")
        elif self.ends("eli"):     self.r("e")
        elif self.ends("ousli"):   self.r("ous")
    elif self.b[self.k - 1] == 'o':
        if self.ends("ization"):   self.r("ize")
        elif self.ends("ation"):   self.r("ate")
        elif self.ends("ator"):    self.r("ate")
    elif self.b[self.k - 1] == 's':
        if self.ends("alism"):     self.r("al")
        elif self.ends("iveness"): self.r("ive")
        elif self.ends("fulness"): self.r("ful")
        elif self.ends("ousness"): self.r("ous")
    elif self.b[self.k - 1] == 't':
        if self.ends("aliti"):     self.r("al")
        elif self.ends("iviti"):   self.r("ive")
        elif self.ends("biliti"):  self.r("ble")
    elif self.b[self.k - 1] == 'g': # --DEPARTURE--
        if self.ends("logi"):
            # keep the 'l' of 'logi' with the stem, so that short stems
            # like 'geo', 'theo' work like 'archaeo', 'philo'
            self.j = self.j + 1     # --NEW-- (Barry Wilkins)
            self.r("og")
    # To match the published algorithm, delete this phrase
def step3(self):
    """step3() deals with -ic-, -full, -ness etc., using a strategy
    similar to step2."""
    # Rules grouped by the word's final letter; the first suffix that
    # matches is rewritten via r(), i.e. only when m() > 0.
    rules = {
        'e': [("icate", "ic"), ("ative", ""), ("alize", "al")],
        'i': [("iciti", "ic")],
        'l': [("ical", "ic"), ("ful", "")],
        's': [("ness", "")],
    }
    for suffix, replacement in rules.get(self.b[self.k], []):
        if self.ends(suffix):
            self.r(replacement)
            break
407
def step4(self):
    """step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
    # Dispatch on the second-to-last letter, then try the suffixes for
    # that letter.  A successful ends() records the suffix start in j;
    # if no suffix matches, return without touching the word.
    if self.b[self.k - 1] == 'a':
        if self.ends("al"): pass
        else: return
    elif self.b[self.k - 1] == 'c':
        if self.ends("ance"): pass
        elif self.ends("ence"): pass
        else: return
    elif self.b[self.k - 1] == 'e':
        if self.ends("er"): pass
        else: return
    elif self.b[self.k - 1] == 'i':
        if self.ends("ic"): pass
        else: return
    elif self.b[self.k - 1] == 'l':
        if self.ends("able"): pass
        elif self.ends("ible"): pass
        else: return
    elif self.b[self.k - 1] == 'n':
        if self.ends("ant"): pass
        elif self.ends("ement"): pass
        elif self.ends("ment"): pass
        elif self.ends("ent"): pass
        else: return
    elif self.b[self.k - 1] == 'o':
        # -ion only counts when preceded by s or t (adoption, decision)
        if self.ends("ion") and (self.b[self.j] == 's' or self.b[self.j] == 't'): pass
        elif self.ends("ou"): pass
        # takes care of -ous
        else: return
    elif self.b[self.k - 1] == 's':
        if self.ends("ism"): pass
        else: return
    elif self.b[self.k - 1] == 't':
        if self.ends("ate"): pass
        elif self.ends("iti"): pass
        else: return
    elif self.b[self.k - 1] == 'u':
        if self.ends("ous"): pass
        else: return
    elif self.b[self.k - 1] == 'v':
        if self.ends("ive"): pass
        else: return
    elif self.b[self.k - 1] == 'z':
        if self.ends("ize"): pass
        else: return
    else:
        return
    # Strip the matched suffix only when the remaining stem has
    # measure > 1.
    if self.m() > 1:
        self.k = self.j
458
def step5(self):
    """step5() removes a final -e if m() > 1, and changes -ll to -l if
    m() > 1.
    """
    self.j = self.k
    if self.b[self.k] == 'e':
        measure = self.m()
        # drop the 'e' unless the stem is short (m == 1) and ends cvc
        if measure > 1:
            self.k = self.k - 1
        elif measure == 1 and not self.cvc(self.k - 1):
            self.k = self.k - 1
    if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
        self.k = self.k - 1
470
def stem_word(self, p, i=0, j=None):
    """Stem the word p[i:j+1] and return the stemmed string.

    p is the string holding the word, i the index of its first
    character and j the index of its last character (defaulting to the
    end of the string).  The irregular-form pool is consulted first;
    otherwise the stemming steps are applied in order and the adjusted
    slice of the internal buffer is returned.  Stemming never
    increases word length.
    """
    ## --NLTK--
    ## Don't print results as we go (commented out the next line)
    #print p[i:j+1]
    if j is None:   # identity test is the idiomatic None check
        j = len(p) - 1

    # copy the parameters into statics
    self.b = p
    self.k = j
    self.k0 = i

    # Irregular forms bypass the main algorithm entirely.
    # ('in' replaces the deprecated dict.has_key(); the slice is also
    # computed once instead of twice.)
    word = self.b[self.k0:self.k + 1]
    if word in self.pool:
        return self.pool[word]

    if self.k <= self.k0 + 1:
        return self.b # --DEPARTURE--

    # With this line, strings of length 1 or 2 don't go through the
    # stemming process, although no mention is made of this in the
    # published algorithm. Remove the line to match the published
    # algorithm.

    self.step1ab()
    self.step1c()
    self.step2()
    self.step3()
    self.step4()
    self.step5()
    return self.b[self.k0:self.k + 1]
509
def adjust_case(self, word, stem):
    """Return stem re-cased to follow the casing of the original word.

    Wherever the stem letter matches the lower-cased word, the word's
    original (possibly upper-case) character is kept; positions the
    stemmer changed use the stem's character as-is.
    """
    # str.lower() replaces the deprecated string.lower() function.
    lower = word.lower()

    # Collect characters in a list and join once, instead of the
    # quadratic ret = ret + ch string build.
    chars = []
    for x in range(len(stem)):
        if lower[x] == stem[x]:
            chars.append(word[x])
        else:
            chars.append(stem[x])

    return "".join(chars)

## --NLTK--
## Don't use this procedure; we want to work with individual
## tokens, instead.  (commented out the following procedure)
#def stem(self, text):
#    parts = re.split("(\W+)", text)
#    numWords = (len(parts) + 1)/2
#
#    ret = ""
#    for i in xrange(numWords):
#        word = parts[2 * i]
#        separator = ""
#        if ((2 * i) + 1) < len(parts):
#            separator = parts[(2 * i) + 1]
#
#        stem = self.stem_word(string.lower(word), 0, len(word) - 1)
#        ret = ret + self.adjust_case(word, stem)
#        ret = ret + separator
#    return ret

## --NLTK--
## Define a stem() method that implements the StemmerI interface.
def stem(self, word):
    """Strip affixes from word and return the stem (StemI interface).

    The word is lower-cased for the algorithm proper (str.lower()
    replaces the deprecated string.lower()), then the result is
    re-cased to match the original via adjust_case().
    """
    stem = self.stem_word(word.lower(), 0, len(word) - 1)
    return self.adjust_case(word, stem)

## --NLTK--
## Add a string representation function
def __repr__(self):
    """Return a short identifying string for this stemmer."""
    return '<Porter Stemmer>'

## --NLTK--
## This test procedure isn't applicable.
#if __name__ == '__main__':
#    p = Porter()
#    if len(sys.argv) > 1:
#        for f in sys.argv[1:]:
#            infile = open(f, 'r')
#            while 1:
#                w = infile.readline()
#                if w == '':
#                    break
#                w = w[:-1]
#                print p.stem(w)

##--NLTK--
## Added a demo() function
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """
    # NOTE: this demo is Python 2 code (print statements) and requires
    # the nltk_lite corpus distribution to be installed.

    from nltk_lite.corpora import treebank
    from nltk_lite import stem

    stemmer = stem.Porter()

    # Stem the tokens of the sample, keeping the originals alongside
    # for comparison.
    # NOTE(review): the counter appears to limit the demo to the first
    # 4 sentences -- confirm the placement of i+=1 against the original.
    i = 0
    orig = []
    stemmed = []
    for sent in treebank.raw():
        for word in sent:
            orig.append(word)
            sword = stemmer.stem(word)
            stemmed.append(sword)
        i+=1
        if i>3: break

    # Convert the results to a string, and word-wrap them.
    results = string.join(stemmed)
    results = re.sub(r"(.{,70})\s", r'\1\n', results+' ').rstrip()

    # Convert the original to a string, and word wrap it.
    original = string.join(orig)
    original = re.sub(r"(.{,70})\s", r'\1\n', original+' ').rstrip()

    # Print the results.
    print '-Original-'.center(70).replace(' ', '*').replace('-', ' ')
    print original
    print '-Results-'.center(70).replace(' ', '*').replace('-', ' ')
    print results
    print '*'*70
606 607 ##--NLTK-- 608 ## Call demo() if we're invoked directly. 609 if __name__ == '__main__': demo() 610