Home | Trees | Indices | Help |
|
---|
|
1 # Natural Language Toolkit - Instances 2 # Understands the creation and validation of instances from input file path 3 # 4 # Author: Sumukh Ghodke <sumukh dot ghodke at gmail dot com> 5 # 6 # URL: <http://nltk.sf.net> 7 # This software is distributed under GPL, for license information see LICENSE.TXT 8 9 from nltk_lite.contrib.classifier import instance as ins, item, cfile, confusionmatrix as cm, numrange as r 10 from nltk_lite.contrib.classifier.exceptions import systemerror as system, invaliddataerror as inv 11 from nltk_lite import probability as prob 12 import operator, UserList 13173119 for instance in self.data: 20 if not instance.is_valid(klass, attributes): 21 return False 22 return True23 2735100 108 12137 new_instances = TrainingInstances([]) 38 for instance in self.data: 39 if(instance.value(attribute) == attr_value): 40 new_instances.append(instance) 41 return new_instances4244 """ 45 Returns an array of range objects, in which each corresponds to the range of values an 46 attribute in the attributes parameter can take. 47 len(returned range array) is equal to len(attributes) 48 """ 49 ranges = [] 50 for attribute in attributes: 51 if not attribute.is_continuous(): 52 raise inv.InvalidDataError('Cannot discretise non continuous attribute ' + attribute.name) 53 values = self.values_grouped_by_attribute(attributes) 54 for value in values: #each entry in values is the range of values for a particular attribute 55 value.sort() 56 ranges.append(r.Range(value[0], value[-1], True)) 57 return ranges5860 """ 61 Returns an array where each element is an array of attribute values for a particular attribute 62 len(returned array) is equal to len(attributes) 63 """ 64 values = [] 65 for attribute in attributes: 66 _vals_in_attr = [] 67 for instance in self.data: 68 if attribute.is_continuous(): 69 _vals_in_attr.append(float(instance.value(attribute))) 70 else: 71 _vals_in_attr.append(instance.value(attribute)) 72 values.append(_vals_in_attr) 73 return values74 8082 values = [] 83 for instance in self.data: 84 values.append(instance.klass_value) 85 return values8688 self.sort_by(attribute) 89 attr_values = self.attribute_values(attribute) 90 return SupervisedBreakpoints(self.klass_values(), attr_values)9193 values = [] 94 for instance in self.data: 95 values.append(instance.value(attribute)) 96 return values97123 """ 124 Used to find breakpoints for discretisation 125 """220127 UserList.UserList.__init__(self, []) 128 self.attr_values = attr_values 129 self.klass_values = klass_values130 134136 frequencies = prob.FreqDist() 137 for index in range(len(self.klass_values) - 1): 138 frequencies.inc(self.klass_values[index]) 139 if frequencies.count(frequencies.max()) >= min_size: 140 self.append(index) 141 frequencies = prob.FreqDist()142 146 150152 breakpoints = [] 153 if len(klass_values) <= 1: return breakpoints 154 from nltk_lite.contrib.classifier import min_entropy_breakpoint 155 position, entropy = min_entropy_breakpoint(klass_values) 156 if abs(entropy) == 0: return breakpoints 157 breakpoints.append(position) 158 first, second = klass_values[:position+1], klass_values[position+1:] 159 if depth < self.max_depth: 160 breakpoints.extend(self.__find_breakpoints(first, depth + 1)) 161 breakpoints.extend([position + 1 + x for x in self.__find_breakpoints(second, depth + 1)]) 162 return breakpoints163165 """ 166 Returns an array of indices where the class membership changes from one value to another 167 the indicies will always lie between 0 and one less than number of instance, both inclusive. 168 """ 169 breakpoints= [] 170 for index in range(len(self.klass_values) - 1): 171 if self.klass_values[index] != self.klass_values[index + 1]: 172 breakpoints.append(index) 173 return breakpoints174176 prev = -1 177 self.sort() 178 to_remove,frequencies = [], prob.FreqDist() 179 for breakpoint in self.data: 180 frequencies.inc(self.klass_values[breakpoint], breakpoint - prev) 181 if frequencies.count(frequencies.max()) < min_size: 182 to_remove.append(breakpoint) 183 else: 184 frequencies = prob.FreqDist() 185 prev = breakpoint 186 for item in to_remove: 187 self.remove(item)188190 to_be_removed = [] 191 for index in range(len(self.data)): 192 i = index 193 while i < len(self.data) - 1 and (self.attr_values[self.data[i]] == self.attr_values[self.data[i] + 1]): 194 #The last and second last elements have the same attribute value or is equal to next breakpoint? 195 if self.data[i] == len(self.attr_values) - 2 or (index < len(self.data) - 1 and self.data[i] == self.data[index + 1]): 196 to_be_removed.append(self.data[i]) 197 break 198 self.data[i] += 1 199 i += 1 200 if index == len(self.data) - 1:#last breakpoint 201 breakpoint = self.data[index] 202 while breakpoint < len(self.attr_values) - 1 and self.attr_values[breakpoint] == self.attr_values[breakpoint + 1]: 203 self.data[index] += 1 204 if self.data[index] == len(self.attr_values) - 1: 205 to_be_removed.append(self.data[index]) 206 break 207 breakpoint = self.data[index] 208 for breakpoint in to_be_removed: 209 self.data.remove(breakpoint)210212 ranges, lower = [], self.attr_values[0] 213 self.sort() 214 for breakpoint in self.data: 215 mid = (self.attr_values[breakpoint] + self.attr_values[breakpoint + 1]) / 2.0 216 ranges.append(r.Range(lower, mid)) 217 lower = mid 218 ranges.append(r.Range(lower, self.attr_values[-1], True)) 219 return ranges
Home | Trees | Indices | Help |
|
---|
Generated by Epydoc 3.0beta1 on Wed May 16 22:47:58 2007 | http://epydoc.sourceforge.net |