Package nltk_lite :: Package contrib :: Package classifier :: Module discretise
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.contrib.classifier.discretise

# Natural Language Toolkit - Discretise
#  The command line entry point to discretisers
#
# Author: Sumukh Ghodke <sumukh dot ghodke at gmail dot com>
#
# URL: <http://nltk.sf.net>
# This software is distributed under GPL, for license information see LICENSE.TXT
  8  from nltk_lite.contrib.classifier import split_ignore_space 
  9  from nltk_lite.contrib.classifier import instances as ins, discretisedattribute as da, cfile as f, numrange as r, format, commandline as cl 
 10  from nltk_lite.contrib.classifier.exceptions import filenotfounderror as fnf, invaliddataerror as inv 
 11  import sys 
 12   
 13  a_help = "Selects the discretisation algorithm                 " \ 
 14         + "Options: UEW for Unsupervised Equal Width            " \ 
 15         + "         UEF for Unsupervised Equal Frequency        " \ 
 16         + "         NS for Naive Supervised                     " \ 
 17         + "         NS1 for Naive Supervised version 1          " \ 
 18         + "         NS2 for Naive Supervised version 2          " \ 
 19         + "         ES for Entropy Based Supervised             " \ 
 20         + "Default: UEW." 
 21   
 22  f_help = "Base name of attribute, klass, training, test and gold" \ 
 23         + " files.                                              " 
 24   
 25  t_help = "Base name of training file for discretisation.       " 
 26   
 27  T_help = "Base name of test file to be discterised.            " 
 28   
 29  g_help = "Base name of gold file to be discretised.            " 
 30   
 31  A_help = "Comma separated list of attribute indices.           " 
 32   
 33  o_help = "Algorithm specific options                           " \ 
 34         + "UEW: Comma separated list of number of parts in which" \ 
 35         + "     each attribute should be split.                 " 
 36   
 37  UNSUPERVISED_EQUAL_WIDTH = 'UEW' 
 38  UNSUPERVISED_EQUAL_FREQUENCY = 'UEF' 
 39  NAIVE_SUPERVISED = 'NS' 
 40  NAIVE_SUPERVISED_V1 = 'NS1' 
 41  NAIVE_SUPERVISED_V2 = 'NS2' 
 42  ENTROPY_BASED_SUPERVISED = 'ES' 
 43   
 44  ALGORITHM_MAPPINGS = {UNSUPERVISED_EQUAL_WIDTH : 'unsupervised_equal_width', \ 
 45                       UNSUPERVISED_EQUAL_FREQUENCY : 'unsupervised_equal_frequency', \ 
 46                       NAIVE_SUPERVISED : 'naive_supervised', \ 
 47                       NAIVE_SUPERVISED_V1 : 'naive_supervised_v1', \ 
 48                       NAIVE_SUPERVISED_V2 : 'naive_supervised_v2', \ 
 49                       ENTROPY_BASED_SUPERVISED : 'entropy_based_supervised'} 
 50   
 51   
class Discretise(cl.CommandLineInterface):
    """Command line entry point for the discretisers.

    Adds the ``-A``/``--attributes`` and ``-o``/``--options`` options to
    those set up by ``cl.CommandLineInterface``, runs the selected
    algorithm through ``Discretiser`` and writes the discretised data out.
    """

    def __init__(self):
        """Register the algorithm choices, help texts and the extra options."""
        cl.CommandLineInterface.__init__(self, list(ALGORITHM_MAPPINGS.keys()), UNSUPERVISED_EQUAL_WIDTH, a_help, f_help, t_help, T_help, g_help)
        self.add_option("-A", "--attributes", dest="attributes", type="string", help=A_help)
        self.add_option("-o", "--options", dest="options", type="string", help=o_help)

    def execute(self):
        """Validate the parsed arguments and run the discretisation.

        The attribute indices are always required.  The algorithm options
        are required for every algorithm except naive supervised.
        """
        cl.CommandLineInterface.execute(self)
        self.attributes_indices = self.get_value('attributes')
        self.options = self.get_value('options')
        self.validate_basic_arguments_are_present()
        self.validate_files_arg_is_exclusive()

        # Without attribute indices get_suffix() would later fail on
        # None.split(), so their absence is reported as a usage error too.
        options_missing = self.algorithm != NAIVE_SUPERVISED and self.options is None
        if self.attributes_indices is None or options_missing:
            self.error("Invalid arguments. One or more required arguments are not present.")
        self.discretise_and_write_to_file()

    def discretise_and_write_to_file(self):
        """Discretise the loaded instances and print the names of the files written."""
        ignore_missing = False
        # NOTE: the original author flagged this branch as duplicated and untested.
        if self.files is not None:
            # A single base name stands in for the training, test and gold paths.
            self.training_path, self.test_path, self.gold_path = [self.files] * 3
            ignore_missing = True
        training, attributes, klass, test, gold = self.get_instances(self.training_path, self.test_path, self.gold_path, ignore_missing)
        disc = Discretiser(training, attributes, klass, test, gold, cl.as_integers('Attribute indices', self.attributes_indices), cl.as_integers('Options', self.options))
        # Dispatch to the Discretiser method named after the chosen algorithm.
        getattr(disc, ALGORITHM_MAPPINGS[self.algorithm])()
        files_written = self.write_to_file(self.get_suffix(), training, attributes, klass, test, gold)
        # Single-argument print(...) behaves the same as a statement
        # (Python 2) and as a function call (Python 3).
        print('The following files were created with discretised values...')
        for file_name in files_written:
            print(file_name)

    def get_suffix(self):
        """Return the file name suffix for the discretised attributes.

        The suffix is ``-d`` followed by ``_<index>`` for every comma
        separated attribute index, e.g. ``'1, 2'`` gives ``'-d_1_2'``.
        """
        indices = self.attributes_indices.split(',')
        return '-d' + ''.join(['_' + index.strip() for index in indices])
89
class Discretiser:
    """Applies a discretisation algorithm to a subset of attributes.

    Each public algorithm method builds one ``da.DiscretisedAttribute`` per
    selected attribute and then discretises the training instances, the
    test and gold instances (when given) and the attributes with them.
    """

    def __init__(self, training, attributes, klass, test, gold, attribute_indices, options = None):
        """Store the data and validate the attribute indices and options.

        attribute_indices -- indices into ``attributes`` to discretise.
        options -- optional algorithm parameters, indexed by position in
                   ``attribute_indices``; None when the algorithm needs none.

        Raises inv.InvalidDataError for an out of range attribute index or
        for an option that is zero or negative.
        """
        self.training, self.attributes, self.klass, self.test, self.gold = training, attributes, klass, test, gold
        self.attribute_indices, self.options = attribute_indices, options
        self.__validate_attribute_indices()
        self.__validate_options()

        self.subset = self.attributes.subset(self.attribute_indices)

    def __validate_options(self):
        """Reject options that are zero or negative; None means no options."""
        if self.options is None: return
        for option in self.options:
            if option == 0:
                raise inv.InvalidDataError('Option cannot be equal to zero.')
            # A negative value is as unusable as zero: it would be used as a
            # chunk size, a split count or a depth by the algorithms below.
            if option < 0:
                raise inv.InvalidDataError('Option cannot be negative, but found ' + str(option))

    def __validate_attribute_indices(self):
        """Ensure every index lies within 0 .. len(attributes) - 1."""
        attribute_count = len(self.attributes)
        for index in self.attribute_indices:
            if index < 0 or index >= attribute_count:
                raise inv.InvalidDataError('Attribute indices should be between 0 and ' + str(attribute_count - 1) + ' both inclusive, but found ' + str(index))

    def unsupervised_equal_width(self):
        """Split each selected attribute's value range into the configured number of parts."""
        ranges = self.training.value_ranges(self.subset)
        disc_attrs = self.discretised_attributes(ranges)
        self.__discretise(disc_attrs)

    def __discretise(self, disc_attrs):
        """Apply disc_attrs to the training data, to test and gold when present, and to the attributes."""
        self.training.discretise(disc_attrs)
        if self.test is not None: self.test.discretise(disc_attrs)
        if self.gold is not None: self.gold.discretise(disc_attrs)
        self.attributes.discretise(disc_attrs)

    def unsupervised_equal_frequency(self):
        """Build ranges from chunks of the sorted values, options[index] values per chunk."""
        values_array = self.training.values_grouped_by_attribute(self.subset)
        disc_attrs = []
        for index in range(len(self.subset)):
            values = values_array[index]
            # Sorted in place, exactly as the chunking helper expects its input.
            values.sort()
            attribute = self.subset[index]
            ranges = ranges_from_chunks(get_chunks_with_frequency(values, self.options[index]))
            disc_attrs.append(da.DiscretisedAttribute(attribute.name, ranges, attribute.index))
        self.__discretise(disc_attrs)

    def naive_supervised(self):
        """Supervised discretisation using breakpoints.find_naive(); needs no options."""
        self.__supervised_discretisation(lambda breakpoints, index: breakpoints.find_naive())

    def naive_supervised_v1(self):
        """Supervised discretisation using breakpoints.find_naive_v1(options[index])."""
        self.__supervised_discretisation(lambda breakpoints, index: breakpoints.find_naive_v1(self.options[index]))

    def naive_supervised_v2(self):
        """Supervised discretisation using breakpoints.find_naive_v2(options[index])."""
        self.__supervised_discretisation(lambda breakpoints, index: breakpoints.find_naive_v2(self.options[index]))

    def entropy_based_supervised(self):
        """Supervised discretisation using breakpoints.find_entropy_based_max_depth(options[index])."""
        self.__supervised_discretisation(lambda breakpoints, index: breakpoints.find_entropy_based_max_depth(self.options[index]))

    def __supervised_discretisation(self, action):
        """Run action(breakpoints, index) for each selected attribute, then discretise.

        ``action`` is called on the breakpoints object before its
        ``as_ranges()`` result is read, so it is expected to work by side
        effect on that object.
        """
        disc_attrs = []
        for index in range(len(self.subset)):
            attribute = self.subset[index]
            breakpoints = self.training.supervised_breakpoints(attribute)
            action(breakpoints, index)
            disc_attrs.append(da.DiscretisedAttribute(attribute.name, breakpoints.as_ranges(), attribute.index))
        self.__discretise(disc_attrs)

    def discretised_attributes(self, ranges):
        """Return one DiscretisedAttribute per option, splitting ranges[index] by options[index]."""
        discretised_attributes = []
        # Indexed access is kept on purpose: only len() and [] are known to
        # be supported by the range and attribute containers.
        for index in range(len(self.options)):
            _range, width, attribute = ranges[index], self.options[index], self.subset[index]
            discretised_attributes.append(da.DiscretisedAttribute(attribute.name, _range.split(width), attribute.index))
        return discretised_attributes
159
def get_chunks_with_frequency(values, freq):
    """Split sorted values into consecutive chunks of at most freq items.

    After each chunk, any following values equal to the chunk's last value
    are skipped: they appear in no chunk, so the next chunk always starts
    with a different value.

    values -- a sliceable sequence, expected to be sorted; it is not modified.
    freq -- the chunk size, which must be greater than zero.

    Returns a list of chunks, empty when values is empty.
    Raises ValueError when freq is not positive; an empty chunk could not
    be compared against the values that follow it.
    """
    if freq <= 0:
        raise ValueError('Chunk frequency should be greater than zero, but found ' + str(freq))
    chunks = []
    # Walk the sequence with a cursor instead of re-slicing the remainder.
    start, total = 0, len(values)
    while start < total:
        chunk = values[start:start + freq]
        chunks.append(chunk)
        start += freq
        while start < total and values[start] == chunk[-1]:
            start += 1
    return chunks
169
def ranges_from_chunks(chunks):
    """Turn consecutive chunks of values into a list of r.Range objects.

    Every chunk except the last gives a range from its first value to the
    first value of the chunk after it.  The last chunk gives a range from
    its first to its last value, built with a third argument of True
    (presumably marking the upper bound as inclusive -- confirm in numrange).

    Returns an empty list when there are no chunks.
    """
    if not chunks:
        # No values were chunked, so there is nothing to build a range from.
        return []
    ranges = [r.Range(current[0], following[0]) for current, following in zip(chunks, chunks[1:])]
    last = chunks[-1]
    ranges.append(r.Range(last[0], last[-1], True))
    return ranges
# Script entry point: run the discretise command with the command line
# arguments that follow the program name.
if __name__ == "__main__":
    discretise_command = Discretise()
    discretise_command.run(sys.argv[1:])