Package nltk_lite :: Module probability
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.probability

   1  # Natural Language Toolkit: Probability and Statistics 
   2  # 
   3  # Copyright (C) 2001-2007 University of Pennsylvania 
   4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
   5  #         Steven Bird <sb@csse.unimelb.edu.au> (additions) 
   6  #         Trevor Cohn <tacohn@cs.mu.oz.au> (additions) 
   7  # URL: <http://nltk.sf.net> 
   8  # For license information, see LICENSE.TXT 
   9  # 
  10  # $Id: probability.py 4525 2007-05-16 10:03:01Z stevenbird $ 
  11   
  12  _NINF = float('-1e300') 
  13   
  14  """ 
  15  Classes for representing and processing probabilistic information. 
  16   
  17  The L{FreqDist} class is used to encode X{frequency distributions}, 
  18  which count the number of times that each outcome of an experiment 
  19  occurs. 
  20   
  21  The L{ProbDistI} class defines a standard interface for X{probability 
  22  distributions}, which encode the probability of each outcome for an 
  23  experiment.  There are two types of probability distribution: 
  24   
  25    - X{derived probability distributions} are created from frequency 
  26      distributions.  They attempt to model the probability distribution 
  27      that generated the frequency distribution. 
  28    - X{analytic probability distributions} are created directly from 
  29      parameters (such as variance). 
  30   
  31  The L{ConditionalFreqDist} class and L{ConditionalProbDistI} interface 
  32  are used to encode conditional distributions.  Conditional probability 
  33  distributions can be derived or analytic; but currently the only 
  34  implementation of the C{ConditionalProbDistI} interface is 
  35  L{ConditionalProbDist}, a derived distribution. 
  36   
  37  """ 
  38   
  39  import types, math 
  40   
  41  ##////////////////////////////////////////////////////// 
  42  ##  Frequency Distributions 
  43  ##////////////////////////////////////////////////////// 
  44   
45 -class FreqDist(object):
46 """ 47 A frequency distribution for the outcomes of an experiment. A 48 frequency distribution records the number of times each outcome of 49 an experiment has occured. For example, a frequency distribution 50 could be used to record the frequency of each word type in a 51 document. Formally, a frequency distribution can be defined as a 52 function mapping from each sample to the number of times that 53 sample occured as an outcome. 54 55 Frequency distributions are generally constructed by running a 56 number of experiments, and incrementing the count for a sample 57 every time it is an outcome of an experiment. For example, the 58 following code will produce a frequency distribution that encodes 59 how often each word occurs in a text: 60 61 >>> fdist = FreqDist() 62 >>> for word in tokenize.whitespace(sent): 63 ... fdist.inc(word) 64 """
65 - def __init__(self):
66 """ 67 Construct a new empty, C{FreqDist}. In particular, the count 68 for every sample is zero. 69 """ 70 self._count = {} 71 self._N = 0 72 self._Nr_cache = None 73 self._max_cache = None
74
75 - def inc(self, sample, count=1):
76 """ 77 Increment this C{FreqDist}'s count for the given 78 sample. 79 80 @param sample: The sample whose count should be incremented. 81 @type sample: any 82 @param count: The amount to increment the sample's count by. 83 @type count: C{int} 84 @rtype: None 85 @raise NotImplementedError: If C{sample} is not a 86 supported sample type. 87 """ 88 if count == 0: return 89 90 self._N += count 91 self._count[sample] = self._count.get(sample,0) + count 92 93 # Invalidate the Nr cache and max cache. 94 self._Nr_cache = None 95 self._max_cache = None
96
97 - def N(self):
98 """ 99 @return: The total number of sample outcomes that have been 100 recorded by this C{FreqDist}. For the number of unique 101 sample values (or bins) with counts greater than zero, use 102 C{FreqDist.B()}. 103 @rtype: C{int} 104 """ 105 return self._N
106
107 - def B(self):
108 """ 109 @return: The total number of sample values (or X{bins}) that 110 have counts greater than zero. For the total 111 number of sample outcomes recorded, use C{FreqDist.N()}. 112 @rtype: C{int} 113 """ 114 return len(self._count)
115
116 - def samples(self):
117 """ 118 @return: A list of all samples that have been recorded as 119 outcomes by this frequency distribution. Use C{count()} 120 to determine the count for each sample. 121 @rtype: C{list} 122 """ 123 return self._count.keys()
124
125 - def Nr(self, r, bins=None):
126 """ 127 @return: The number of samples with count r. 128 @rtype: C{int} 129 @type r: C{int} 130 @param r: A sample count. 131 @type bins: C{int} 132 @param bins: The number of possible sample outcomes. C{bins} 133 is used to calculate Nr(0). In particular, Nr(0) is 134 C{bins-self.B()}. If C{bins} is not specified, it 135 defaults to C{self.B()} (so Nr(0) will be 0). 136 """ 137 if r < 0: raise IndexError, 'FreqDist.Nr(): r must be non-negative' 138 139 # Special case for Nr(0): 140 if r == 0: 141 if bins is None: return 0 142 else: return bins-self.B() 143 144 # We have to search the entire distribution to find Nr. Since 145 # this is an expensive operation, and is likely to be used 146 # repeatedly, cache the results. 147 if self._Nr_cache is None: 148 self._cache_Nr_values() 149 150 if r >= len(self._Nr_cache): return 0 151 return self._Nr_cache[r]
152
153 - def _cache_Nr_values(self):
154 Nr = [0] 155 for sample in self.samples(): 156 c = self._count.get(sample, 0) 157 if c >= len(Nr): 158 Nr += [0]*(c+1-len(Nr)) 159 Nr[c] += 1 160 self._Nr_cache = Nr
161
162 - def count(self, sample):
163 """ 164 Return the count of a given sample. The count of a sample is 165 defined as the number of times that sample outcome was 166 recorded by this C{FreqDist}. Counts are non-negative 167 integers. 168 169 @return: The count of a given sample. 170 @rtype: C{int} 171 @param sample: the sample whose count 172 should be returned. 173 @type sample: any. 174 """ 175 return self._count.get(sample, 0)
176
177 - def freq(self, sample):
178 """ 179 Return the frequency of a given sample. The frequency of a 180 sample is defined as the count of that sample divided by the 181 total number of sample outcomes that have been recorded by 182 this C{FreqDist}. The count of a sample is defined as the 183 number of times that sample outcome was recorded by this 184 C{FreqDist}. Frequencies are always real numbers in the range 185 [0, 1]. 186 187 @return: The frequency of a given sample. 188 @rtype: float 189 @param sample: the sample whose frequency 190 should be returned. 191 @type sample: any 192 """ 193 if self._N is 0: return 0 194 return float(self._count.get(sample, 0)) / self._N
195
196 - def max(self):
197 """ 198 Return the sample with the greatest number of outcomes in this 199 frequency distribution. If two or more samples have the same 200 number of outcomes, return one of them; which sample is 201 returned is undefined. If no outcomes have occured in this 202 frequency distribution, return C{None}. 203 204 @return: The sample with the maximum number of outcomes in this 205 frequency distribution. 206 @rtype: any or C{None} 207 """ 208 if self._max_cache is None: 209 best_sample = None 210 best_count = -1 211 for sample in self._count.keys(): 212 if self._count[sample] > best_count: 213 best_sample = sample 214 best_count = self._count[sample] 215 self._max_cache = best_sample 216 return self._max_cache
217
218 - def sorted_samples(self):
219 """ 220 Return the samples sorted in decreasing order of frequency. Instances 221 with the same count will be arbitrarily ordered. Instances with a 222 count of zero will be omitted. This method is C{O(N^2)}, where C{N} is 223 the number of samples, but will complete in a shorter time on average. 224 225 @return: The set of samples in sorted order. 226 @rtype: sequence of any 227 """ 228 items = [(-count,sample) for (sample,count) in self._count.items()] 229 items.sort() 230 return [sample for (neg_count,sample) in items]
231
232 - def __repr__(self):
233 """ 234 @return: A string representation of this C{FreqDist}. 235 @rtype: string 236 """ 237 return '<FreqDist with %d samples>' % self.N()
238
239 - def __str__(self):
240 """ 241 @return: A string representation of this C{FreqDist}. 242 @rtype: string 243 """ 244 samples = self.sorted_samples() 245 items = ['%r: %r' % (s, self._count[s]) for s in samples] 246 return '<FreqDist: %s>' % ', '.join(items)
247
248 - def __contains__(self, sample):
249 """ 250 @return: True if the given sample occurs one or more times in 251 this frequency distribution. 252 @rtype: C{boolean} 253 @param sample: The sample to search for. 254 @type sample: any 255 """ 256 return self._count.has_key(sample)
257 258 ##////////////////////////////////////////////////////// 259 ## Probability Distributions 260 ##////////////////////////////////////////////////////// 261
262 -class ProbDistI(object):
263 """ 264 A probability distribution for the outcomes of an experiment. A 265 probability distribution specifies how likely it is that an 266 experiment will have any given outcome. For example, a 267 probability distribution could be used to predict the probability 268 that a token in a document will have a given type. Formally, a 269 probability distribution can be defined as a function mapping from 270 samples to nonnegative real numbers, such that the sum of every 271 number in the function's range is 1.0. C{ProbDist}s are often 272 used to model the probability distribution of the experiment used 273 to generate a frequency distribution. 274 """
275 - def __init__(self):
276 if self.__class__ == ProbDistI: 277 raise AssertionError, "Interfaces can't be instantiated"
278
279 - def prob(self, sample):
280 """ 281 @return: the probability for a given sample. Probabilities 282 are always real numbers in the range [0, 1]. 283 @rtype: float 284 @param sample: The sample whose probability 285 should be returned. 286 @type sample: any 287 """ 288 raise AssertionError()
289
290 - def logprob(self, sample):
291 """ 292 @return: the natural logarithm of the probability for a given 293 sample. Log probabilities range from negitive infinity to 294 zero. 295 @rtype: float 296 @param sample: The sample whose probability 297 should be returned. 298 @type sample: any 299 """ 300 # Default definition, in terms of prob() 301 p = self.prob(sample) 302 if p == 0: 303 # Use some approximation to infinity. What this does 304 # depends on your system's float implementation. 305 return _NINF 306 else: 307 return math.log(p)
308
309 - def max(self):
310 """ 311 @return: the sample with the greatest probability. If two or 312 more samples have the same probability, return one of them; 313 which sample is returned is undefined. 314 @rtype: any 315 """ 316 raise AssertionError()
317
318 - def samples(self):
319 """ 320 @return: A list of all samples that have nonzero 321 probabilities. Use C{prob} to find the probability of 322 each sample. 323 @rtype: C{list} 324 """ 325 raise AssertionError()
326
327 -class UniformProbDist(ProbDistI):
328 """ 329 A probability distribution that assigns equal probability to each 330 sample in a given set; and a zero probability to all other 331 samples. 332 """
333 - def __init__(self, samples):
334 """ 335 Construct a new uniform probability distribution, that assigns 336 equal probability to each sample in C{samples}. 337 338 @param samples: The samples that should be given uniform 339 probability. 340 @type samples: C{list} 341 @raise ValueError: If C{samples} is empty. 342 """ 343 if len(samples) == 0: 344 raise ValueError('A Uniform probability distribution must '+ 345 'have at least one sample.') 346 self._sampleset = set(samples) 347 self._prob = 1.0/len(self._sampleset) 348 self._samples = list(self._sampleset)
349
350 - def prob(self, sample):
351 if sample in self._sampleset: return self._prob 352 else: return 0
353 - def max(self): return self._samples[0]
354 - def samples(self): return self._samples
355 - def __repr__(self):
356 return '<UniformProbDist with %d samples>' % len(self._sampleset)
357
358 -class DictionaryProbDist(ProbDistI):
359 """ 360 A probability distribution whose probabilities are directly 361 specified by a given dictionary. The given dictionary maps 362 samples to probabilities. 363 """
364 - def __init__(self, prob_dict=None, log=False, normalize=False):
365 """ 366 Construct a new probability distribution from the given 367 dictionary, which maps values to probabilities (or to log 368 probabilities, if C{log} is true). If C{normalize} is 369 true, then the probability values are scaled by a constant 370 factor such that they sum to 1. 371 """ 372 self._prob_dict = prob_dict.copy() 373 self._log = log 374 375 # Normalize the distribution, if requested. 376 if normalize: 377 if log: 378 value_sum = sum_logs(self._prob_dict.values()) 379 if value_sum <= _NINF: 380 logp = math.log(1.0/len(prob_dict.keys())) 381 for x in prob_dict.keys(): 382 self._prob_dict[x] = logp 383 else: 384 for (x, p) in self._prob_dict.items(): 385 self._prob_dict[x] -= value_sum 386 else: 387 value_sum = sum(self._prob_dict.values()) 388 if value_sum == 0: 389 p = 1.0/len(prob_dict.keys()) 390 for x in prob_dict.keys(): 391 self._prob_dict[x] = p 392 else: 393 norm_factor = 1.0/value_sum 394 for (x, p) in self._prob_dict.items(): 395 self._prob_dict[x] *= norm_factor
396
397 - def prob(self, sample):
398 if self._log: 399 if sample not in self._prob_dict: return 0 400 else: return math.exp(self._prob_dict[sample]) 401 else: 402 return self._prob_dict.get(sample, 0)
403
404 - def logprob(self, sample):
405 if self._log: 406 return self._prob_dict.get(sample, _NINF) 407 else: 408 if sample not in self._prob_dict: return _NINF 409 else: return math.log(self._prob_dict[sample])
410
411 - def max(self):
412 if not hasattr(self, '_max'): 413 self._max = max((p,v) for (v,p) in self._prob_dict.items())[1] 414 return self._max
415 - def samples(self):
416 return self._prob_dict.keys()
417 - def __repr__(self):
418 return '<ProbDist with %d samples>' % len(self._prob_dict)
419
420 -class MLEProbDist(ProbDistI):
421 """ 422 The maximum likelihood estimate for the probability distribution 423 of the experiment used to generate a frequency distribution. The 424 X{maximum likelihood estimate} approximates the probability of 425 each sample as the frequency of that sample in the frequency 426 distribution. 427 """
428 - def __init__(self, freqdist):
429 """ 430 Use the maximum likelihood estimate to create a probability 431 distribution for the experiment used to generate C{freqdist}. 432 433 @type freqdist: C{FreqDist} 434 @param freqdist: The frequency distribution that the 435 probability estimates should be based on. 436 """ 437 if freqdist.N() == 0: 438 raise ValueError('An MLE probability distribution must '+ 439 'have at least one sample.') 440 441 self._freqdist = freqdist
442
443 - def freqdist(self):
444 """ 445 @return: The frequency distribution that this probability 446 distribution is based on. 447 @rtype: C{FreqDist} 448 """ 449 return self._freqdist
450
451 - def prob(self, sample):
452 return self._freqdist.freq(sample)
453
454 - def max(self):
455 return self._freqdist.max()
456
457 - def samples(self):
458 return self._freqdist.samples()
459
460 - def __repr__(self):
461 """ 462 @rtype: C{string} 463 @return: A string representation of this C{ProbDist}. 464 """ 465 return '<MLEProbDist based on %d samples>' % self._freqdist.N()
466
467 -class LidstoneProbDist(ProbDistI):
468 """ 469 The Lidstone estimate for the probability distribution of the 470 experiment used to generate a frequency distribution. The 471 C{Lidstone estimate} is paramaterized by a real number M{gamma}, 472 which typically ranges from 0 to 1. The X{Lidstone estimate} 473 approximates the probability of a sample with count M{c} from an 474 experiment with M{N} outcomes and M{B} bins as 475 M{(c+gamma)/(N+B*gamma)}. This is equivalant to adding 476 M{gamma} to the count for each bin, and taking the maximum 477 likelihood estimate of the resulting frequency distribution. 478 """
479 - def __init__(self, freqdist, gamma, bins=None):
480 """ 481 Use the Lidstone estimate to create a probability distribution 482 for the experiment used to generate C{freqdist}. 483 484 @type freqdist: C{FreqDist} 485 @param freqdist: The frequency distribution that the 486 probability estimates should be based on. 487 @type gamma: C{float} 488 @param gamma: A real number used to paramaterize the 489 estimate. The Lidstone estimate is equivalant to adding 490 M{gamma} to the count for each bin, and taking the 491 maximum likelihood estimate of the resulting frequency 492 distribution. 493 @type bins: C{int} 494 @param bins: The number of sample values that can be generated 495 by the experiment that is described by the probability 496 distribution. This value must be correctly set for the 497 probabilities of the sample values to sum to one. If 498 C{bins} is not specified, it defaults to C{freqdist.B()}. 499 """ 500 if (bins == 0) or (bins is None and freqdist.N() == 0): 501 name = self.__class__.__name__[:-8] 502 raise ValueError('A %s probability distribution ' % name + 503 'must have at least one bin.') 504 if (bins is not None) and (bins < freqdist.B()): 505 name = self.__class__.__name__[:-8] 506 raise ValueError('\nThe number of bins in a %s must be ' % name + 507 'greater than or equal to\nthe number of '+ 508 'bins in the FreqDist used to create it.') 509 510 self._freqdist = freqdist 511 self._gamma = float(gamma) 512 self._N = self._freqdist.N() 513 514 if bins is None: bins = freqdist.B() 515 self._bins = bins
516
517 - def freqdist(self):
518 """ 519 @return: The frequency distribution that this probability 520 distribution is based on. 521 @rtype: C{FreqDist} 522 """ 523 return self._freqdist
524
525 - def prob(self, sample):
526 c = self._freqdist.count(sample) 527 return (c + self._gamma) / (self._N + self._bins * self._gamma)
528
529 - def max(self):
530 # For Lidstone distributions, probability is monotonic with 531 # frequency, so the most probable sample is the one that 532 # occurs most frequently. 533 return self._freqdist.max()
534
535 - def samples(self):
536 return self._freqdist.samples()
537
538 - def __repr__(self):
539 """ 540 @rtype: C{string} 541 @return: A string representation of this C{ProbDist}. 542 """ 543 return '<LidstoneProbDist based on %d samples>' % self._freqdist.N()
544
545 -class LaplaceProbDist(LidstoneProbDist):
546 """ 547 The Laplace estimate for the probability distribution of the 548 experiment used to generate a frequency distribution. The 549 X{Lidstone estimate} approximates the probability of a sample with 550 count M{c} from an experiment with M{N} outcomes and M{B} bins as 551 M{(c+1)/(N+B)}. This is equivalant to adding one to the count for 552 each bin, and taking the maximum likelihood estimate of the 553 resulting frequency distribution. 554 """
555 - def __init__(self, freqdist, bins=None):
556 """ 557 Use the Laplace estimate to create a probability distribution 558 for the experiment used to generate C{freqdist}. 559 560 @type freqdist: C{FreqDist} 561 @param freqdist: The frequency distribution that the 562 probability estimates should be based on. 563 @type bins: C{int} 564 @param bins: The number of sample values that can be generated 565 by the experiment that is described by the probability 566 distribution. This value must be correctly set for the 567 probabilities of the sample values to sum to one. If 568 C{bins} is not specified, it defaults to C{freqdist.B()}. 569 """ 570 LidstoneProbDist.__init__(self, freqdist, 1, bins)
571
572 - def __repr__(self):
573 """ 574 @rtype: C{string} 575 @return: A string representation of this C{ProbDist}. 576 """ 577 return '<LaplaceProbDist based on %d samples>' % self._freqdist.N()
578
579 -class ELEProbDist(LidstoneProbDist):
580 """ 581 The expected likelihood estimate for the probability distribution 582 of the experiment used to generate a frequency distribution. The 583 X{expected likelihood estimate} approximates the probability of a 584 sample with count M{c} from an experiment with M{N} outcomes and 585 M{B} bins as M{(c+0.5)/(N+B/2)}. This is equivalant to adding 0.5 586 to the count for each bin, and taking the maximum likelihood 587 estimate of the resulting frequency distribution. 588 """
589 - def __init__(self, freqdist, bins=None):
590 """ 591 Use the expected likelihood estimate to create a probability 592 distribution for the experiment used to generate C{freqdist}. 593 594 @type freqdist: C{FreqDist} 595 @param freqdist: The frequency distribution that the 596 probability estimates should be based on. 597 @type bins: C{int} 598 @param bins: The number of sample values that can be generated 599 by the experiment that is described by the probability 600 distribution. This value must be correctly set for the 601 probabilities of the sample values to sum to one. If 602 C{bins} is not specified, it defaults to C{freqdist.B()}. 603 """ 604 LidstoneProbDist.__init__(self, freqdist, 0.5, bins)
605
606 - def __repr__(self):
607 """ 608 @rtype: C{string} 609 @return: A string representation of this C{ProbDist}. 610 """ 611 return '<ELEProbDist based on %d samples>' % self._freqdist.N()
612
613 -class HeldoutProbDist(ProbDistI):
614 """ 615 The heldout estimate for the probability distribution of the 616 experiment used to generate two frequency distributions. These 617 two frequency distributions are called the "heldout frequency 618 distribution" and the "base frequency distribution." The 619 X{heldout estimate} uses uses the X{heldout frequency 620 distribution} to predict the probability of each sample, given its 621 frequency in the X{base frequency distribution}. 622 623 In particular, the heldout estimate approximates the probability 624 for a sample that occurs M{r} times in the base distribution as 625 the average frequency in the heldout distribution of all samples 626 that occur M{r} times in the base distribution. 627 628 This average frequency is M{Tr[r]/(Nr[r]*N)}, where: 629 - M{Tr[r]} is the total count in the heldout distribution for 630 all samples that occur M{r} times in the base 631 distribution. 632 - M{Nr[r]} is the number of samples that occur M{r} times in 633 the base distribution. 634 - M{N} is the number of outcomes recorded by the heldout 635 frequency distribution. 636 637 In order to increase the efficiency of the C{prob} member 638 function, M{Tr[r]/(Nr[r]*N)} is precomputed for each value of M{r} 639 when the C{HeldoutProbDist} is created. 640 641 @type _estimate: C{list} of C{float} 642 @ivar _estimate: A list mapping from M{r}, the number of 643 times that a sample occurs in the base distribution, to the 644 probability estimate for that sample. C{_estimate[M{r}]} is 645 calculated by finding the average frequency in the heldout 646 distribution of all samples that occur M{r} times in the base 647 distribution. In particular, C{_estimate[M{r}]} = 648 M{Tr[r]/(Nr[r]*N)}. 649 @type _max_r: C{int} 650 @ivar _max_r: The maximum number of times that any sample occurs 651 in the base distribution. C{_max_r} is used to decide how 652 large C{_estimate} must be. 653 """
654 - def __init__(self, base_fdist, heldout_fdist, bins=None):
655 """ 656 Use the heldout estimate to create a probability distribution 657 for the experiment used to generate C{base_fdist} and 658 C{heldout_fdist}. 659 660 @type base_fdist: C{FreqDist} 661 @param base_fdist: The base frequency distribution. 662 @type heldout_fdist: C{FreqDist} 663 @param heldout_fdist: The heldout frequency distribution. 664 @type bins: C{int} 665 @param bins: The number of sample values that can be generated 666 by the experiment that is described by the probability 667 distribution. This value must be correctly set for the 668 probabilities of the sample values to sum to one. If 669 C{bins} is not specified, it defaults to C{freqdist.B()}. 670 """ 671 672 self._base_fdist = base_fdist 673 self._heldout_fdist = heldout_fdist 674 675 # The max number of times any sample occurs in base_fdist. 676 self._max_r = base_fdist.count(base_fdist.max()) 677 678 # Calculate Tr, Nr, and N. 679 Tr = self._calculate_Tr() 680 Nr = [base_fdist.Nr(r, bins) for r in range(self._max_r+1)] 681 N = heldout_fdist.N() 682 683 # Use Tr, Nr, and N to compute the probability estimate for 684 # each value of r. 685 self._estimate = self._calculate_estimate(Tr, Nr, N)
686
687 - def _calculate_Tr(self):
688 """ 689 @return: the list M{Tr}, where M{Tr[r]} is the total count in 690 C{heldout_fdist} for all samples that occur M{r} 691 times in C{base_fdist}. 692 @rtype: C{list} of C{float} 693 """ 694 Tr = [0.0] * (self._max_r+1) 695 for sample in self._heldout_fdist.samples(): 696 r = self._base_fdist.count(sample) 697 Tr[r] += self._heldout_fdist.count(sample) 698 return Tr
699
700 - def _calculate_estimate(self, Tr, Nr, N):
701 """ 702 @return: the list M{estimate}, where M{estimate[r]} is the 703 probability estimate for any sample that occurs M{r} times 704 in the base frequency distribution. In particular, 705 M{estimate[r]} is M{Tr[r]/(N[r]*N)}. In the special case 706 that M{N[r]=0}, M{estimate[r]} will never be used; so we 707 define M{estimate[r]=None} for those cases. 708 @rtype: C{list} of C{float} 709 @type Tr: C{list} of C{float} 710 @param Tr: the list M{Tr}, where M{Tr[r]} is the total count in 711 the heldout distribution for all samples that occur M{r} 712 times in base distribution. 713 @type Nr: C{list} of C{float} 714 @param Nr: The list M{Nr}, where M{Nr[r]} is the number of 715 samples that occur M{r} times in the base distribution. 716 @type N: C{int} 717 @param N: The total number of outcomes recorded by the heldout 718 frequency distribution. 719 """ 720 estimate = [] 721 for r in range(self._max_r+1): 722 if Nr[r] == 0: estimate.append(None) 723 else: estimate.append(Tr[r]/(Nr[r]*N)) 724 return estimate
725
726 - def base_fdist(self):
727 """ 728 @return: The base frequency distribution that this probability 729 distribution is based on. 730 @rtype: C{FreqDist} 731 """ 732 return self._base_fdist
733
734 - def heldout_fdist(self):
735 """ 736 @return: The heldout frequency distribution that this 737 probability distribution is based on. 738 @rtype: C{FreqDist} 739 """ 740 return self._heldout_fdist
741
742 - def prob(self, sample):
743 # Use our precomputed probability estimate. 744 r = self._base_fdist.count(sample) 745 return self._estimate[r]
746
747 - def max(self):
748 # Note: the Heldout estimation is *not* necessarily monotonic; 749 # so this implementation is currently broken. However, it 750 # should give the right answer *most* of the time. :) 751 return self._base_fdist.max()
752
753 - def __repr__(self):
754 """ 755 @rtype: C{string} 756 @return: A string representation of this C{ProbDist}. 757 """ 758 s = '<HeldoutProbDist: %d base samples; %d heldout samples>' 759 return s % (self._base_fdist.N(), self._heldout_fdist.N())
760
761 -class CrossValidationProbDist(ProbDistI):
762 """ 763 The cross-validation estimate for the probability distribution of 764 the experiment used to generate a set of frequency distribution. 765 The X{cross-validation estimate} for the probability of a sample 766 is found by averaging the held-out estimates for the sample in 767 each pair of frequency distributions. 768 """
769 - def __init__(self, freqdists, bins):
770 """ 771 Use the cross-validation estimate to create a probability 772 distribution for the experiment used to generate 773 C{freqdists}. 774 775 @type freqdists: C{list} of C{FreqDist} 776 @param freqdists: A list of the frequency distributions 777 generated by the experiment. 778 @type bins: C{int} 779 @param bins: The number of sample values that can be generated 780 by the experiment that is described by the probability 781 distribution. This value must be correctly set for the 782 probabilities of the sample values to sum to one. If 783 C{bins} is not specified, it defaults to C{freqdist.B()}. 784 """ 785 self._freqdists = freqdists 786 787 # Create a heldout probability distribution for each pair of 788 # frequency distributions in freqdists. 789 self._heldout_probdists = [] 790 for fdist1 in freqdists: 791 for fdist2 in freqdists: 792 if fdist1 is not fdist2: 793 probdist = HeldoutProbDist(fdist1, fdist2, bins) 794 self._heldout_probdists.append(probdist)
795
796 - def freqdists(self):
797 """ 798 @rtype: C{list} of C{FreqDist} 799 @return: The list of frequency distributions that this 800 C{ProbDist} is based on. 801 """ 802 return self._freqdists
803
804 - def prob(self, sample):
805 # Find the average probability estimate returned by each 806 # heldout distribution. 807 prob = 0.0 808 for heldout_probdist in self._heldout_probdists: 809 prob += heldout_probdist.prob(sample) 810 return prob/len(self._heldout_probdists)
811
812 - def __repr__(self):
813 """ 814 @rtype: C{string} 815 @return: A string representation of this C{ProbDist}. 816 """ 817 return '<CrossValidationProbDist: %d-way>' % len(self._freqdists)
818
819 -class WittenBellProbDist(ProbDistI):
820 """ 821 The Witten-Bell estimate of a probability distribution. This distribution 822 allocates uniform probability mass to as yet unseen events by using the 823 number of events that have only been seen once. The probability mass 824 reserved for unseen events is equal to: 825 826 - M{T / (N + T)} 827 828 where M{T} is the number of observed event types and M{N} is the total 829 number of observed events. This equates to the maximum likelihood estimate 830 of a new type event occuring. The remaining probability mass is discounted 831 such that all probability estimates sum to one, yielding: 832 833 - M{p = T / Z (N + T)}, if count = 0 834 - M{p = c / (N + T)}, otherwise 835 """ 836
837 - def __init__(self, freqdist, bins=None):
838 """ 839 Creates a distribution of Witten-Bell probability estimates. This 840 distribution allocates uniform probability mass to as yet unseen 841 events by using the number of events that have only been seen once. 842 The probability mass reserved for unseen events is equal to: 843 844 - M{T / (N + T)} 845 846 where M{T} is the number of observed event types and M{N} is the total 847 number of observed events. This equates to the maximum likelihood 848 estimate of a new type event occuring. The remaining probability mass 849 is discounted such that all probability estimates sum to one, 850 yielding: 851 852 - M{p = T / Z (N + T)}, if count = 0 853 - M{p = c / (N + T)}, otherwise 854 855 The parameters M{T} and M{N} are taken from the C{freqdist} parameter 856 (the C{B()} and C{N()} values). The normalising factor M{Z} is 857 calculated using these values along with the C{bins} parameter. 858 859 @param freqdist: The frequency counts upon which to base the 860 estimation. 861 @type freqdist: C{FreqDist} 862 @param bins: The number of possible event types. This must be 863 at least as large as the number of bins in the 864 C{freqdist}. If C{None}, then it's assumed to be 865 equal to that of the C{freqdist} 866 @type bins: C{Int} 867 """ 868 assert bins == None or bins >= freqdist.B(),\ 869 'Bins parameter must not be less than freqdist.B()' 870 if bins == None: 871 bins = freqdist.B() 872 self._freqdist = freqdist 873 self._T = self._freqdist.B() 874 self._Z = bins - self._freqdist.B() 875 self._N = self._freqdist.N()
876
877 - def prob(self, sample):
878 # inherit docs from ProbDistI 879 c = self._freqdist.count(sample) 880 if c == 0: 881 return self._T / float(self._Z * (self._N + self._T)) 882 else: 883 return c / float(self._N + self._T)
884
885 - def max(self):
886 return self._freqdist.max()
887
888 - def samples(self):
889 return self._freqdist.samples()
890
891 - def freqdist(self):
892 return self._freqdist
893
894 - def __repr__(self):
895 """ 896 @rtype: C{string} 897 @return: A string representation of this C{ProbDist}. 898 """ 899 return '<WittenBellProbDist based on %d samples>' % self._freqdist.N()
900
901 -class GoodTuringProbDist(ProbDistI):
902 """ 903 The Good-Turing estimate of a probability distribution. This method 904 calculates the probability mass to assign to events with zero or low 905 counts based on the number of events with higher counts. It does so by 906 using the smoothed count M{c*}: 907 908 - M{c* = (c + 1) N(c + 1) / N(c)} 909 910 where M{c} is the original count, M{N(i)} is the number of event types 911 observed with count M{i}. These smoothed counts are then normalised to 912 yield a probability distribution. 913 """ 914 # TODO - add a cut-off parameter, above which the counts are unmodified 915 # (see J&M p216) 916
917 - def __init__(self, freqdist, bins):
918 """ 919 Creates a Good-Turing probability distribution estimate. This method 920 calculates the probability mass to assign to events with zero or low 921 counts based on the number of events with higher counts. It does so by 922 using the smoothed count M{c*}: 923 924 - M{c* = (c + 1) N(c + 1) / N(c)} 925 926 where M{c} is the original count, M{N(i)} is the number of event types 927 observed with count M{i}. These smoothed counts are then normalised to 928 yield a probability distribution. 929 930 The C{bins} parameter allows C{N(0)} to be estimated. 931 932 @param freqdist: The frequency counts upon which to base the 933 estimation. 934 @type freqdist: C{FreqDist} 935 @param bins: The number of possible event types. This must be 936 at least as large as the number of bins in the 937 C{freqdist}. If C{None}, then it's taken to be 938 equal to C{freqdist.B()}. 939 @type bins: C{Int} 940 """ 941 assert bins == None or bins >= freqdist.B(),\ 942 'Bins parameter must not be less than freqdist.B()' 943 if bins == None: 944 bins = freqdist.B() 945 self._freqdist = freqdist 946 self._bins = bins
947
948 - def prob(self, sample):
949 # inherit docs from FreqDist 950 c = self._freqdist.count(sample) 951 nc = self._freqdist.Nr(c, self._bins) 952 ncn = self._freqdist.Nr(c + 1, self._bins) 953 954 # avoid divide-by-zero errors for sparse datasets 955 if nc == 0 or self._freqdist.N() == 0: 956 return 0.0 957 958 return float(c + 1) * ncn / (nc * self._freqdist.N())
959
960 - def max(self):
961 return self._freqdist.max()
962
963 - def samples(self):
964 return self._freqdist.samples()
965
966 - def freqdist(self):
967 return self._freqdist
968
969 - def __repr__(self):
970 """ 971 @rtype: C{string} 972 @return: A string representation of this C{ProbDist}. 973 """ 974 return '<GoodTuringProbDist based on %d samples>' % self._freqdist.N()
975
976 -class MutableProbDist(ProbDistI):
977 """ 978 An mutable probdist where the probabilities may be easily modified. This 979 simply copies an existing probdist, storing the probability values in a 980 mutable dictionary and providing an update method. 981 """ 982
983 - def __init__(self, prob_dist, samples, store_logs=True):
984 """ 985 Creates the mutable probdist based on the given prob_dist and using 986 the list of samples given. These values are stored as log 987 probabilities if the store_logs flag is set. 988 989 @param prob_dist: the distribution from which to garner the 990 probabilities 991 @type prob_dist: ProbDist 992 @param samples: the complete set of samples 993 @type samples: sequence of any 994 @param store_logs: whether to store the probabilities as logarithms 995 @type store_logs: bool 996 """ 997 try: 998 import numpy 999 except ImportError: 1000 print "Error: Please install numpy; for instructions see http://nltk.sf.net/install.html" 1001 exit() 1002 self._samples = samples 1003 self._sample_dict = dict((samples[i], i) for i in range(len(samples))) 1004 self._data = numpy.zeros(len(samples), numpy.float64) 1005 for i in range(len(samples)): 1006 if store_logs: 1007 self._data[i] = prob_dist.logprob(samples[i]) 1008 else: 1009 self._data[i] = prob_dist.prob(samples[i]) 1010 self._logs = store_logs
1011
1012 - def samples(self):
1013 # inherit documentation 1014 return self._samples
1015
1016 - def prob(self, sample):
1017 # inherit documentation 1018 i = self._sample_dict.get(sample) 1019 if i != None: 1020 if self._logs: 1021 return exp(self._data[i]) 1022 else: 1023 return self._data[i] 1024 else: 1025 return 0.0
1026
1027 - def logprob(self, sample):
1028 # inherit documentation 1029 i = self._sample_dict.get(sample) 1030 if i != None: 1031 if self._logs: 1032 return self._data[i] 1033 else: 1034 return log(self._data[i]) 1035 else: 1036 return float('-inf')
1037
1038 - def update(self, sample, prob, log=True):
1039 """ 1040 Update the probability for the given sample. This may cause the object 1041 to stop being the valid probability distribution - the user must 1042 ensure that they update the sample probabilities such that all samples 1043 have probabilities between 0 and 1 and that all probabilities sum to 1044 one. 1045 1046 @param sample: the sample for which to update the probability 1047 @type sample: C{any} 1048 @param prob: the new probability 1049 @type prob: C{float} 1050 @param log: is the probability already logged 1051 @type log: C{bool} 1052 """ 1053 i = self._sample_dict.get(sample) 1054 assert i != None 1055 if self._logs: 1056 if log: self._data[i] = prob 1057 else: self._data[i] = log(prob) 1058 else: 1059 if log: self._data[i] = exp(prob) 1060 else: self._data[i] = prob
1061 1062 ##////////////////////////////////////////////////////// 1063 ## Probability Distribution Operations 1064 ##////////////////////////////////////////////////////// 1065
1066 -def log_likelihood(test_pdist, actual_pdist):
1067 # Is this right? 1068 return sum(actual_pdist.prob(s) * math.log(test_pdist.prob(s)) 1069 for s in actual_pdist.samples())
1070 1071 ##////////////////////////////////////////////////////// 1072 ## Conditional Distributions 1073 ##////////////////////////////////////////////////////// 1074
1075 -class ConditionalFreqDist(object):
1076 """ 1077 A collection of frequency distributions for a single experiment 1078 run under different conditions. Conditional frequency 1079 distributions are used to record the number of times each sample 1080 occured, given the condition under which the experiment was run. 1081 For example, a conditional frequency distribution could be used to 1082 record the frequency of each word (type) in a document, given its 1083 length. Formally, a conditional frequency distribution can be 1084 defined as a function that maps from each condition to the 1085 C{FreqDist} for the experiment under that condition. 1086 1087 The frequency distribution for each condition is accessed using 1088 the indexing operator: 1089 1090 >>> cfdist[3] 1091 <FreqDist with 73 outcomes> 1092 >>> cfdist[3].freq('the') 1093 0.4 1094 >>> cfdist[3].count('dog') 1095 2 1096 1097 When the indexing operator is used to access the frequency 1098 distribution for a condition that has not been accessed before, 1099 C{ConditionalFreqDist} creates a new empty C{FreqDist} for that 1100 condition. 1101 1102 Conditional frequency distributions are typically constructed by 1103 repeatedly running an experiment under a variety of conditions, 1104 and incrementing the sample outcome counts for the appropriate 1105 conditions. For example, the following code will produce a 1106 conditional frequency distribution that encodes how often each 1107 word type occurs, given the length of that word type: 1108 1109 >>> cfdist = ConditionalFreqDist() 1110 >>> for word in tokenize.whitespace(sent): 1111 ... condition = len(word) 1112 ... cfdist[condition].inc(word) 1113 """
1114 - def __init__(self):
1115 """ 1116 Construct a new empty conditional frequency distribution. In 1117 particular, the count for every sample, under every condition, 1118 is zero. 1119 """ 1120 self._fdists = {}
1121
1122 - def __getitem__(self, condition):
1123 """ 1124 Return the frequency distribution that encodes the frequency 1125 of each sample outcome, given that the experiment was run 1126 under the given condition. If the frequency distribution for 1127 the given condition has not been accessed before, then this 1128 will create a new empty C{FreqDist} for that condition. 1129 1130 @return: The frequency distribution that encodes the frequency 1131 of each sample outcome, given that the experiment was run 1132 under the given condition. 1133 @rtype: C{FreqDist} 1134 1135 @param condition: The condition under which the experiment was 1136 run. 1137 @type condition: any 1138 """ 1139 # Create the conditioned freq dist, if it doesn't exist 1140 if not self._fdists.has_key(condition): 1141 self._fdists[condition] = FreqDist() 1142 1143 return self._fdists[condition]
1144
1145 - def conditions(self):
1146 """ 1147 @return: A list of the conditions that have been accessed for 1148 this C{ConditionalFreqDist}. Use the indexing operator to 1149 access the frequency distribution for a given condition. 1150 Note that the frequency distributions for some conditions 1151 may contain zero sample outcomes. 1152 @rtype: C{list} 1153 """ 1154 return self._fdists.keys()
1155
1156 - def __repr__(self):
1157 """ 1158 @return: A string representation of this 1159 C{ConditionalFreqDist}. 1160 @rtype: C{string} 1161 """ 1162 n = len(self._fdists) 1163 return '<ConditionalFreqDist with %d conditions>' % n
1164
1165 -class ConditionalProbDistI(object):
1166 """ 1167 A collection of probability distributions for a single experiment 1168 run under different conditions. Conditional probability 1169 distributions are used to estimate the likelihood of each sample, 1170 given the condition under which the experiment was run. For 1171 example, a conditional probability distribution could be used to 1172 estimate the probability of each word type in a document, given 1173 the length of the word type. Formally, a conditional probability 1174 distribution can be defined as a function that maps from each 1175 condition to the C{ProbDist} for the experiment under that 1176 condition. 1177 """
1178 - def __init__(self):
1179 raise AssertionError, 'ConditionalProbDistI is an interface'
1180
1181 - def __getitem__(self, condition):
1182 """ 1183 @return: The probability distribution for the experiment run 1184 under the given condition. 1185 @rtype: C{ProbDistI} 1186 @param condition: The condition whose probability distribution 1187 should be returned. 1188 @type condition: any 1189 """ 1190 raise AssertionError
1191
1192 - def conditions(self):
1193 """ 1194 @return: A list of the conditions that are represented by 1195 this C{ConditionalProbDist}. Use the indexing operator to 1196 access the probability distribution for a given condition. 1197 @rtype: C{list} 1198 """ 1199 raise AssertionError
1200 1201 # For now, this is the only implementation of ConditionalProbDistI; 1202 # but we would want a different implementation if we wanted to build a 1203 # conditional probability distribution analytically (e.g., a gaussian 1204 # distribution), rather than basing it on an underlying frequency 1205 # distribution.
1206 -class ConditionalProbDist(ConditionalProbDistI):
1207 """ 1208 A conditional probability distribution modelling the experiments 1209 that were used to generate a conditional frequency distribution. 1210 A C{ConditoinalProbDist} is constructed from a 1211 C{ConditionalFreqDist} and a X{C{ProbDist} factory}: 1212 1213 - The B{C{ConditionalFreqDist}} specifies the frequency 1214 distribution for each condition. 1215 - The B{C{ProbDist} factory} is a function that takes a 1216 condition's frequency distribution, and returns its 1217 probability distribution. A C{ProbDist} class's name (such as 1218 C{MLEProbDist} or C{HeldoutProbDist}) can be used to specify 1219 that class's constructor. 1220 1221 The first argument to the C{ProbDist} factory is the frequency 1222 distribution that it should model; and the remaining arguments are 1223 specified by the C{factory_args} parameter to the 1224 C{ConditionalProbDist} constructor. For example, the following 1225 code constructs a C{ConditionalProbDist}, where the probability 1226 distribution for each condition is an C{ELEProbDist} with 10 bins: 1227 1228 >>> cpdist = ConditionalProbDist(cfdist, ELEProbDist, 10) 1229 >>> print cpdist['run'].max() 1230 'NN' 1231 >>> print cpdist['run'].prob('NN') 1232 0.0813 1233 """
1234 - def __init__(self, cfdist, probdist_factory, 1235 supply_condition=False, *factory_args):
1236 """ 1237 Construct a new conditional probability distribution, based on 1238 the given conditional frequency distribution and C{ProbDist} 1239 factory. 1240 1241 @type cfdist: L{ConditionalFreqDist} 1242 @param cfdist: The C{ConditionalFreqDist} specifying the 1243 frequency distribution for each condition. 1244 @type probdist_factory: C{class} or C{function} 1245 @param probdist_factory: The function or class that maps 1246 a condition's frequency distribution to its probability 1247 distribution. The function is called with the frequency 1248 distribution as its first argument, the condition as its 1249 second argument (only if C{supply_condition=True}), and 1250 C{factory_args} as its remaining arguments. 1251 @type supply_condition: C{bool} 1252 @param supply_condition: If true, then pass the condition as 1253 the second argument to C{probdist_factory}. 1254 @type factory_args: (any) 1255 @param factory_args: Extra arguments for C{probdist_factory}. 1256 These arguments are usually used to specify extra 1257 properties for the probability distributions of individual 1258 conditions, such as the number of bins they contain. 1259 """ 1260 self._probdist_factory = probdist_factory 1261 self._cfdist = cfdist 1262 self._supply_condition = supply_condition 1263 self._factory_args = factory_args 1264 1265 self._pdists = {} 1266 for c in cfdist.conditions(): 1267 if supply_condition: 1268 pdist = probdist_factory(cfdist[c], c, *factory_args) 1269 else: 1270 pdist = probdist_factory(cfdist[c], *factory_args) 1271 self._pdists[c] = pdist
1272
1273 - def __getitem__(self, condition):
1274 if not self._pdists.has_key(condition): 1275 # If it's a condition we haven't seen, create a new prob 1276 # dist from the empty freq dist. Typically, this will 1277 # give a uniform prob dist. 1278 pdist = self._probdist_factory(FreqDist(), *self._factory_args) 1279 self._pdists[condition] = pdist 1280 1281 return self._pdists[condition]
1282
1283 - def conditions(self):
1284 return self._pdists.keys()
1285
1286 - def __repr__(self):
1287 """ 1288 @return: A string representation of this 1289 C{ConditionalProbDist}. 1290 @rtype: C{string} 1291 """ 1292 n = len(self._pdists) 1293 return '<ConditionalProbDist with %d conditions>' % n
1294
1295 -class DictionaryConditionalProbDist(ConditionalProbDistI):
1296 """ 1297 An alternative ConditionalProbDist that simply wraps a dictionary of 1298 ProbDists rather than creating these from FreqDists. 1299 """ 1300
1301 - def __init__(self, probdist_dict):
1302 """ 1303 @param probdist_dict: a dictionary containing the probdists indexed 1304 by the conditions 1305 @type probdist_dict: dict any -> probdist 1306 """ 1307 self._dict = probdist_dict
1308
1309 - def __getitem__(self, condition):
1310 # inherit documentation 1311 # this will cause an exception for unseen conditions 1312 return self._dict[condition]
1313
1314 - def conditions(self):
1315 # inherit documentation 1316 return self._dict.keys()
1317 1318 ##////////////////////////////////////////////////////// 1319 ## Adding in log-space. 1320 ##////////////////////////////////////////////////////// 1321 1322 # If the difference is bigger than this, then just take the bigger one: 1323 _ADD_LOGS_MAX_DIFF = math.log(1e-30) 1324
1325 -def add_logs(logx, logy):
1326 """ 1327 Given two numbers C{logx}=M{log(x)} and C{logy}=M{log(y)}, return 1328 M{log(x+y)}. Conceptually, this is the same as returning 1329 M{log(exp(C{logx})+exp(C{logy}))}, but the actual implementation 1330 avoids overflow errors that could result from direct computation. 1331 """ 1332 if (logx < logy + _ADD_LOGS_MAX_DIFF): 1333 return logy 1334 if (logy < logx + _ADD_LOGS_MAX_DIFF): 1335 return logx 1336 base = min(logx, logy) 1337 return base + math.log(math.exp(logx-base) + math.exp(logy-base))
1338
1339 -def sum_logs(logs):
1340 if len(logs) == 0: 1341 # Use some approximation to infinity. What this does 1342 # depends on your system's float implementation. 1343 return _NINF 1344 else: 1345 return reduce(add_logs, logs[1:], logs[0])
1346 1347 ##////////////////////////////////////////////////////// 1348 ## Probabilistic Mix-in 1349 ##////////////////////////////////////////////////////// 1350
1351 -class ProbabilisticMixIn(object):
1352 """ 1353 A mix-in class to associate probabilities with other classes 1354 (trees, rules, etc.). To use the C{ProbabilisticMixIn} class, 1355 define a new class that derives from an existing class and from 1356 ProbabilisticMixIn. You will need to define a new constructor for 1357 the new class, which explicitly calls the constructors of both its 1358 parent classes. For example: 1359 1360 >>> class A: 1361 ... def __init__(self, x, y): self.data = (x,y) 1362 ... 1363 >>> class ProbabilisticA(A, ProbabilisticMixIn): 1364 ... def __init__(self, x, y, **prob_kwarg): 1365 ... A.__init__(self, x, y) 1366 ... ProbabilisticMixIn.__init__(self, **prob_kwarg) 1367 1368 See the documentation for the ProbabilisticMixIn 1369 L{constructor<__init__>} for information about the arguments it 1370 expects. 1371 1372 You should generally also redefine the string representation 1373 methods, the comparison methods, and the hashing method. 1374 """
1375 - def __init__(self, **kwargs):
1376 """ 1377 Initialize this object's probability. This initializer should 1378 be called by subclass constructors. C{prob} should generally be 1379 the first argument for those constructors. 1380 1381 @kwparam prob: The probability associated with the object. 1382 @type prob: C{float} 1383 @kwparam logprob: The log of the probability associated with 1384 the object. 1385 @type logprob: C{float} 1386 """ 1387 if 'prob' in kwargs: 1388 if 'logprob' in kwargs: 1389 raise TypeError('Must specify either prob or logprob ' 1390 '(not both)') 1391 else: 1392 ProbabilisticMixIn.set_prob(self, kwargs['prob']) 1393 elif 'logprob' in kwargs: 1394 ProbabilisticMixIn.set_logprob(self, kwargs['logprob']) 1395 else: 1396 self.__prob = self.__logprob = None
1397
1398 - def set_prob(self, prob):
1399 """ 1400 Set the probability associated with this object to C{prob}. 1401 @param prob: The new probability 1402 @type prob: C{float} 1403 """ 1404 self.__prob = prob 1405 self.__logprob = None
1406
1407 - def set_logprob(self, logprob):
1408 """ 1409 Set the log probability associated with this object to 1410 C{logprob}. I.e., set the probability associated with this 1411 object to C{exp(logprob)}. 1412 @param logprob: The new log probability 1413 @type logprob: C{float} 1414 """ 1415 self.__logprob = prob 1416 self.__prob = None
1417
1418 - def prob(self):
1419 """ 1420 @return: The probability associated with this object. 1421 @rtype: C{float} 1422 """ 1423 if self.__prob is None: 1424 if self.__logprob is None: return None 1425 self.__prob = math.exp(self.__logprob) 1426 return self.__prob
1427
1428 - def logprob(self):
1429 """ 1430 @return: C{log(p)}, where C{p} is the probability associated 1431 with this object. 1432 1433 @rtype: C{float} 1434 """ 1435 if self.__logprob is None: 1436 if self.__prob is None: return None 1437 self.__logprob = math.log(self.__prob) 1438 return self.__logprob
1439
1440 -class ImmutableProbabilisticMixIn(ProbabilisticMixIn):
1441 - def set_prob(self, prob):
1442 raise ValueError, '%s is immutable' % self.__class__.__name__
1443 - def set_logprob(self, prob):
1444 raise ValueError, '%s is immutable' % self.__class__.__name__
1445 1446 ##////////////////////////////////////////////////////// 1447 ## Demonstration 1448 ##////////////////////////////////////////////////////// 1449
1450 -def _create_rand_fdist(numsamples, numoutcomes):
1451 """ 1452 Create a new frequency distribution, with random samples. The 1453 samples are numbers from 1 to C{numsamples}, and are generated by 1454 summing two numbers, each of which has a uniform distribution. 1455 """ 1456 import random 1457 from math import sqrt 1458 fdist = FreqDist() 1459 for x in range(numoutcomes): 1460 y = (random.randint(1, (1+numsamples)/2) + 1461 random.randint(0, numsamples/2)) 1462 fdist.inc(y) 1463 return fdist
1464
1465 -def _create_sum_pdist(numsamples):
1466 """ 1467 Return the true probability distribution for the experiment 1468 C{_create_rand_fdist(numsamples, x)}. 1469 """ 1470 fdist = FreqDist() 1471 for x in range(1, (1+numsamples)/2+1): 1472 for y in range(0, numsamples/2+1): 1473 fdist.inc(x+y) 1474 return MLEProbDist(fdist)
1475
1476 -def demo(numsamples=6, numoutcomes=500):
1477 """ 1478 A demonstration of frequency distributions and probability 1479 distributions. This demonstration creates three frequency 1480 distributions with, and uses them to sample a random process with 1481 C{numsamples} samples. Each frequency distribution is sampled 1482 C{numoutcomes} times. These three frequency distributions are 1483 then used to build six probability distributions. Finally, the 1484 probability estimates of these distributions are compared to the 1485 actual probability of each sample. 1486 1487 @type numsamples: C{int} 1488 @param numsamples: The number of samples to use in each demo 1489 frequency distributions. 1490 @type numoutcomes: C{int} 1491 @param numoutcomes: The total number of outcomes for each 1492 demo frequency distribution. These outcomes are divided into 1493 C{numsamples} bins. 1494 @rtype: C{None} 1495 """ 1496 1497 # Randomly sample a stochastic process three times. 1498 fdist1 = _create_rand_fdist(numsamples, numoutcomes) 1499 fdist2 = _create_rand_fdist(numsamples, numoutcomes) 1500 fdist3 = _create_rand_fdist(numsamples, numoutcomes) 1501 1502 # Use our samples to create probability distributions. 1503 pdists = [ 1504 MLEProbDist(fdist1), 1505 LidstoneProbDist(fdist1, 0.5, numsamples), 1506 HeldoutProbDist(fdist1, fdist2, numsamples), 1507 HeldoutProbDist(fdist2, fdist1, numsamples), 1508 CrossValidationProbDist([fdist1, fdist2, fdist3], numsamples), 1509 _create_sum_pdist(numsamples), 1510 ] 1511 1512 # Find the probability of each sample. 1513 vals = [] 1514 for n in range(1,numsamples+1): 1515 vals.append(tuple([n, fdist1.freq(n)] + 1516 [pdist.prob(n) for pdist in pdists])) 1517 1518 # Print the results in a formatted table. 1519 print ('%d samples (1-%d); %d outcomes were sampled for each FreqDist' % 1520 (numsamples, numsamples, numoutcomes)) 1521 print '='*9*(len(pdists)+2) 1522 FORMATSTR = ' FreqDist '+ '%8s '*(len(pdists)-1) + '| Actual' 1523 print FORMATSTR % tuple(`pdist`[1:9] for pdist in pdists[:-1]) 1524 print '-'*9*(len(pdists)+2) 1525 FORMATSTR = '%3d %8.6f ' + '%8.6f '*(len(pdists)-1) + '| %8.6f' 1526 for val in vals: 1527 print FORMATSTR % val 1528 1529 # Print the totals for each column (should all be 1.0) 1530 zvals = zip(*vals) 1531 def sum(lst): return reduce(lambda x,y:x+y, lst, 0) 1532 sums = [sum(val) for val in zvals[1:]] 1533 print '-'*9*(len(pdists)+2) 1534 FORMATSTR = 'Total ' + '%8.6f '*(len(pdists)) + '| %8.6f' 1535 print FORMATSTR % tuple(sums) 1536 print '='*9*(len(pdists)+2) 1537 1538 # Display the distributions themselves, if they're short enough. 1539 if len(`str(fdist1)`) < 70: 1540 print ' fdist1:', str(fdist1) 1541 print ' fdist2:', str(fdist2) 1542 print ' fdist3:', str(fdist3) 1543 print
1544 1545 if __name__ == '__main__': 1546 demo(6, 10) 1547 demo(5, 5000) 1548