Package nltk_lite :: Package contrib :: Package classifier :: Module format
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.contrib.classifier.format

  1  from nltk_lite.contrib.classifier import cfile, item, attribute as a, instance as ins, instances as inss 
  2  from nltk_lite.contrib.classifier.exceptions import systemerror as se, filenotfounderror as fnf 
  3   
class FormatI:
    """Interface for a file format that stores classifier data sets.

    A concrete format subclasses this and overrides every method below.
    The implementations here only signal that the method is abstract:
    calling one that was not overridden raises ``AssertionError``.
    """

    def __init__(self, name):
        """Remember ``name``, the identifier of this format."""
        self.name = name

    def get_attributes(self, path):
        """Read the attribute definitions of the data set at ``path``."""
        raise AssertionError('FormatI.get_attributes must be overridden by a subclass')

    def get_training_instances(self, path):
        """Read the training instances of the data set at ``path``."""
        raise AssertionError('FormatI.get_training_instances must be overridden by a subclass')

    def get_test_instances(self, path):
        """Read the test instances of the data set at ``path``."""
        raise AssertionError('FormatI.get_test_instances must be overridden by a subclass')

    def get_gold_instances(self, path):
        """Read the gold instances of the data set at ``path``."""
        raise AssertionError('FormatI.get_gold_instances must be overridden by a subclass')

    def get_klass(self, path):
        """Read the class values of the data set at ``path``."""
        raise AssertionError('FormatI.get_klass must be overridden by a subclass')

    def write_training_to_file(self, training, path):
        """Write the ``training`` instances to the data set at ``path``."""
        raise AssertionError('FormatI.write_training_to_file must be overridden by a subclass')

    def write_test_to_file(self, training, path):
        """Write test instances to the data set at ``path``.

        The first parameter is named ``training`` for historical reasons;
        it receives the instances to write.
        """
        raise AssertionError('FormatI.write_test_to_file must be overridden by a subclass')

    def write_gold_to_file(self, instances, path):
        """Write the gold ``instances`` to the data set at ``path``."""
        raise AssertionError('FormatI.write_gold_to_file must be overridden by a subclass')

    def write_metadata_to_file(self, attributes, klass, path):
        """Write ``attributes`` and ``klass`` metadata to the data set at ``path``."""
        raise AssertionError('FormatI.write_metadata_to_file must be overridden by a subclass')
34
class C45Format(FormatI):
    """Read and write classifier data sets stored as C4.5-style text files.

    Every method takes a base ``path``. The concrete file is selected by
    pairing that path with one of the extensions defined below; the pairing
    itself is delegated to ``cfile.File``.
    """

    # Extensions of the files that make up one data set.
    DATA = 'data'    # training instances
    TEST = 'test'    # test instances
    GOLD = 'gold'    # gold instances
    NAMES = 'names'  # class values followed by attribute definitions

    def __init__(self):
        """Register this format under the name ``"c45"``."""
        FormatI.__init__(self, "c45")

    def get_attributes(self, path):
        """Build the attribute collection described by the names file.

        Lines whose processed form is empty, or that ``item.NameItem`` does
        not recognise as an attribute, are skipped. Attributes are numbered
        from 0 in the order they appear.
        """
        attributes = []
        for line in self.__get_lines(path, self.NAMES):
            nameitem = item.NameItem(line)
            processed = nameitem.processed()
            if processed and nameitem.isAttribute():
                # The running index equals the number of attributes so far.
                attributes.append(a.Attribute(self.get_name(processed),
                                              self.get_values(processed),
                                              len(attributes)))
        return a.Attributes(attributes)

    def get_training_instances(self, path):
        """Read the data file; the last column of each row is the class."""
        return inss.TrainingInstances(self.__read_instances(
            path, self.DATA,
            lambda values: ins.TrainingInstance(values[:-1], values[-1])))

    def get_test_instances(self, path):
        """Read the test file; every column of a row is an attribute value."""
        return inss.TestInstances(self.__read_instances(
            path, self.TEST,
            lambda values: ins.TestInstance(values)))

    def get_gold_instances(self, path):
        """Read the gold file; the last column of each row is the class."""
        return inss.GoldInstances(self.__read_instances(
            path, self.GOLD,
            lambda values: ins.GoldInstance(values[:-1], values[-1])))

    def get_klass(self, path):
        """Return the class values listed on the first line of the names file.

        The first line is processed by ``item.NameItem`` and split on commas.
        NOTE(review): only the very first line is inspected, and an empty
        file fails on ``lines[0]`` - confirm that names files always start
        with the class line.
        """
        lines = self.__get_lines(path, self.NAMES)
        return item.NameItem(lines[0]).processed().split(',')

    def write_training_to_file(self, instances, path):
        """Write each instance as ``<attribute values>,<klass_value>``.

        Returns the name of the file written (``path`` + DOT + extension).
        """
        return self.write_to_file(
            path, self.DATA, instances,
            lambda instance: instance.attr_values_as_str() + ',' + str(instance.klass_value))

    def write_test_to_file(self, instances, path):
        """Write each instance as ``<attribute values>,<classified_klass>``.

        Returns the name of the file written (``path`` + DOT + extension).
        """
        return self.write_to_file(
            path, self.TEST, instances,
            lambda instance: instance.attr_values_as_str() + ',' + str(instance.classified_klass))

    def write_gold_to_file(self, instances, path):
        """Write each instance with both its known and its classified class.

        Row layout: ``<attribute values>,<klass_value>,<classified_klass>``.
        NOTE(review): this emits two trailing columns, whereas
        ``get_gold_instances`` treats only the last column as the class -
        confirm files written here are not meant to be read back by it.
        Returns the name of the file written (``path`` + DOT + extension).
        """
        return self.write_to_file(
            path, self.GOLD, instances,
            lambda instance: (instance.attr_values_as_str() + ',' +
                              str(instance.klass_value) + ',' +
                              str(instance.classified_klass)))

    def write_metadata_to_file(self, attributes, klass, path):
        """Write the names file: class values first, then one line per attribute.

        The first line is the comma separated values of ``klass`` followed
        by ``.``; each attribute line is ``<name>:<values>.``. A ``klass``
        of ``None`` is treated as having no values. Returns the name of the
        file written (``path`` + DOT + names extension).
        """
        new_file = cfile.File(path, self.NAMES)
        # Presumably the flag requests overwriting an existing file - confirm
        # against cfile.File.create.
        new_file.create(True)
        if klass is None:
            klass = []
        # Iterate over the klass argument itself; looping over an empty
        # accumulator string here would drop every class value.
        lines = [','.join([str(value) for value in klass]) + '.']
        for attribute in attributes:
            lines.append(attribute.name + ':' + attribute.values_as_str() + '.')
        new_file.write(lines)
        return path + cfile.DOT + self.NAMES

    def write_to_file(self, path, extension, instances, method):
        """Write one line per instance, rendered by ``method``.

        ``method`` is called with each instance and must return the text of
        its line. Returns the name of the file written.
        """
        new_file = cfile.File(path, extension)
        # Presumably the flag requests overwriting an existing file - confirm
        # against cfile.File.create.
        new_file.create(True)
        new_file.write([method(instance) for instance in instances])
        return path + cfile.DOT + extension

    def __read_instances(self, path, ext, build):
        """Return ``build(values)`` for every non-blank line of the file."""
        instances = []
        for line in self.__get_lines(path, ext):
            values = self.__get_comma_sep_values(line)
            if values is not None:
                instances.append(build(values))
        return instances

    def __get_comma_sep_values(self, line):
        """Split a stripped line on commas; return None if nothing is left."""
        stripped = item.Item(line).stripNewLineAndWhitespace()
        if stripped:
            return stripped.split(',')
        return None

    def __get_lines(self, path, ext):
        """Return the lines of the file for ``path`` and ``ext``.

        Raises ``se.SystemError`` when ``path`` is None.
        """
        if path is None:
            raise se.SystemError('Cannot open file. File name not specified.')
        # The identity callback makes for_each_line hand the lines back as is.
        return cfile.File(path, ext).for_each_line(lambda line: line)

    def get_name(self, line):
        """Return the part of ``line`` before the first colon."""
        return line[:self.__pos_of_colon(line)]

    def get_values(self, line):
        """Return the comma separated values after the first colon."""
        return line[self.__pos_of_colon(line) + 1:].split(',')

    def __pos_of_colon(self, line):
        """Return the index of the first colon, or -1 if there is none."""
        return line.find(':')
# Module-level C45Format instance, created once when this module is imported.
C45_FORMAT = C45Format()