1
2
3
4
5
6
7
8 """
9 This module provides functionality for parsing and manipulating the
10 contents of a Toolbox lexicon without reference to its metadata.
11 """
12
13 import os, re, sys
14 from nltk_lite.corpora import get_basedir
15 from nltk_lite.corpora.toolbox import StandardFormat
16 from utilities import Field, SequentialDictionary
17
18
20
21 """
22 This class represents a Toolbox lexicon, which consists of an
23 optional header and one or more Entry objects, saved in a dictionary
24 whose keys are passed as a parameter to the parse() method.
25 """
26
28 """
29 This method construct a Lexicon object with a header and a dictionary of
30 entries.
31 """
32 self._key_fields = ['lx']
33 self._header = ''
34 self._entries = {}
35 self._file = file
36
38 """
39 This method defines the string representation of a Lexicon object
40 """
41 s = "%s\n" % self.get_header()
42 for e in self.get_entries():
43 s = "%s%s\n" % (s, e)
44 return s
45
def set_header(self, header):
    """
    This method sets the raw text of the header.

    @param header: header (as raw text)
    @type header: string
    """
    # NOTE(review): the ``def`` line was missing from the listing; the
    # name is inferred from the docstring and the matching get_header()
    # -- confirm.
    self._header = header
53
def get_header(self):
    """
    This method obtains the raw text of the header.

    @return: raw header
    @rtype: string
    """
    # NOTE(review): the ``def`` line was missing from the listing; the
    # name is taken from the ``get_header()`` call sites -- confirm.
    return self._header
61
def get_entries(self):
    """
    This method iterates over all of the entries stored in the Lexicon.
    Keys are visited in sorted order; entries sharing a key are yielded
    in the order in which they were added.

    @return: all of the entries in the Lexicon
    @rtype: generator of Entry objects
    """
    # NOTE(review): the ``def`` line was missing from the listing; the
    # name is taken from the ``get_entries()`` call sites -- confirm.
    # sorted() works on both Python 2 and 3, unlike keys().sort().
    for key in sorted(self._entries):
        for entry in self._entries[key]:
            yield entry
76
def add_entry(self, entry, unique=False):
    """
    This method adds an Entry object to a Lexicon object. The entry is
    stored under a key built from the values of the fields named in
    C{self._key_fields}: for each key field present in the entry, a
    "-" followed by that field's values joined with "/" is appended to
    the key. Key fields missing from the entry contribute nothing.

    @param entry: a parsed entry from a Toolbox lexicon
    @type entry: Entry object
    @param unique: raise exception if entry key already exists
    @type unique: boolean
    @raise ValueError: if I{unique} is true and the key is already present
    """
    key_parts = []
    for field_marker in self._key_fields:
        f = entry.get_field(field_marker)
        if f:
            key_parts.append("-" + f.get_values("/"))
    key = "".join(key_parts)

    if unique and key in self._entries:
        msg = "Non-unique entry! \nEntry: \n%s\nKey Fields: %s\nKey: '%s'\n" % (entry, self._key_fields, key)
        raise ValueError(msg)

    # Entries sharing a key are kept together in insertion order.
    self._entries.setdefault(key, []).append(entry)
106
107
def parse(self,
          head_field_marker='lx',
          subentry_field_marker=None,
          key_fields=None,
          unique_entry=True,
          unique_subentry=False):
    """
    This method parses a Toolbox file into this Lexicon object. It will also
    parse subentries provided that the field marker identifying subentries
    is passed to it. The parsed entries are stored on the Lexicon itself
    (via C{add_entry()}); nothing is returned.

    Fields that occur before the first head field are ignored. Fields whose
    marker starts with "_" never start an entry or subentry.

    @param head_field_marker: field marker that identifies the start of an entry
    @type  head_field_marker: string
    @param key_fields: the field(s) to which entries are keyed
    @type  key_fields: list of strings
    @param subentry_field_marker: field marker that identifies subentries
    @type  subentry_field_marker: string
    @param unique_entry: raise an error if entries are non-unique according
                         to I{key_fields} parameter
    @type  unique_entry: boolean
    @param unique_subentry: currently unused by this method
    @type  unique_subentry: boolean
    @raise ValueError: propagated from C{add_entry()} when I{unique_entry}
                       is true and two entries share a key
    """
    if key_fields:
        self._key_fields = key_fields

    entry = None       # entry currently being built, if any
    subentry = None    # subentry currently being built, if any

    self.open(self._file)
    try:
        for fmarker, fvalue in self.raw_fields():
            if fmarker.startswith("_"):
                # TODO: header fields are not stored in the header yet.
                pass
            elif fmarker == head_field_marker:
                if entry is not None:
                    # Attach the pending subentry to the entry it belongs
                    # to before that entry is stored; otherwise it would be
                    # lost or end up on a later entry.
                    if subentry is not None:
                        entry.add_subentry(subentry)
                    self.add_entry(entry, unique_entry)
                entry = Entry()
                subentry = None
            elif (subentry_field_marker
                  and fmarker == subentry_field_marker
                  and entry is not None):
                # A subentry only makes sense inside an entry; a subentry
                # marker seen before any head field is skipped like any
                # other pre-entry field.
                if subentry is not None:
                    entry.add_subentry(subentry)
                subentry = Entry()

            # Add the field to the innermost open container.
            if subentry is not None:
                subentry.add_field(fmarker, fvalue)
            elif entry is not None:
                entry.add_field(fmarker, fvalue)

        # Deal with the last entry (and its last subentry).
        if entry is not None:
            if subentry is not None:
                entry.add_subentry(subentry)
            self.add_entry(entry, unique_entry)
    finally:
        # Close the file even when add_entry() raises.
        self.close()
172
174 """
175 This class represents an entry (record) from a Toolbox lexicon. Each entry
176 consists of a collection of fields, stored as a special type of dictionary
177 which keeps track of the sequence in which its keys were entered.
178 """
179
def __init__(self):
    """
    This method constructs a new, empty Entry object: no fields, empty
    raw text, no number and no subentries.
    """
    self._subentries = None
    self._number = None
    self._rawText = ""
    self._fields = SequentialDictionary()
188
190 """
191 This method defines the string representation of an entry.
192
193 @rtype: string
194 @return: an entry as a string in Standard Format
195 """
196 s = ""
197 fields = self.get_fields()
198 for fm, fvs in self._fields.items():
199 for fv in fvs:
200 s = s + "\n\\%s %s" % (fm, fv)
201 return s
202
def set_raw_text(self, rawText):
    """
    This method stores the raw text from which the Entry object
    was parsed.

    @param rawText: raw Toolbox text from which entry was parsed
    @type  rawText: string
    """
    self._rawText = rawText
212
def get_raw_text(self):
    """
    This method returns the raw text stored for this Entry object.

    @return: the stored raw text
    @rtype: string
    """
    return self._rawText
220
def get_subentries(self):
    """
    This method returns the stored subentries of an entry.

    @rtype: list of Entry objects
    @returns: whatever is stored in C{self._subentries}
    """
    return self._subentries
229
def add_subentry(self, subentry):
    """
    This method appends a subentry (itself an Entry object) to this
    entry, creating the subentry list first when none is stored yet.

    @param subentry: subentry
    @type  subentry: Entry object
    """
    current = self._subentries
    if not current:
        # None or empty: start a fresh list.
        current = self._subentries = []
    current.append(subentry)
241
def set_number(self, number):
    """
    This method records the position of the entry in the dictionary
    as a cardinal number.

    @param number: number of entry
    @type  number: integer
    """
    self._number = number
251
def get_number(self):
    """
    This method returns the stored position of the entry in the
    dictionary.

    @return: the stored entry number
    @rtype: integer
    """
    return self._number
260
def get_fields(self):
    """
    This method returns the values of the entry's field mapping, i.e.
    one collection of field values per field marker.

    @return: the values held for every field marker
    @rtype: list
    """
    all_values = self._fields.values()
    return all_values
268
def get_field_markers(self):
    """
    This method obtains all of the field markers found in the Entry object.

    @return: the field markers of an entry
    @rtype: list
    """
    # NOTE(review): the ``def`` line was missing from the listing; the
    # name is inferred from the docstring and the parallel get_fields()
    # -- confirm.
    return self._fields.keys()
277
def get_values_by_marker(self, field_marker, sep=None):
    """
    Shorter alias for C{get_field_values_by_field_marker()}; the
    arguments are passed through unchanged.

    @param field_marker: marker of desired field
    @type  field_marker: string
    @param sep: separator for field values
    @type  sep: string
    """
    result = self.get_field_values_by_field_marker(field_marker, sep)
    return result
280
def get_field_values_by_field_marker(self, field_marker, sep=None):
    """
    This method returns all of the field values for a given field marker.
    If I{sep} is set, the values are joined into a single string;
    otherwise the stored list of values is returned as is.

    @param field_marker: marker of desired field
    @type  field_marker: string
    @param sep: separator for field values
    @type  sep: string
    @return: the values of the field, or None if the entry has no such field
    @rtype: string (if sep); otherwise, list of field values
    """
    # Only the lookup can legitimately raise KeyError; keep the join
    # outside the try block.
    try:
        values = self._fields[field_marker]
    except KeyError:
        return None
    if sep is None:
        return values
    return sep.join(values)
301
def get_field_as_string(self,
                        field_marker,
                        join_string=""):
    """
    This method returns the values of one field joined into a single
    string. A blank string is returned when the entry has no field
    with the given marker.

    @param field_marker: marker of desired field
    @type  field_marker: string
    @param join_string: string placed between field values (blank by default)
    @type  join_string: string
    @rtype: string
    """
    try:
        values = self._fields[field_marker]
    except KeyError:
        return ""
    return join_string.join(values)
319
def get_field(self, fieldMarker):
    """
    This method wraps the values stored for a field marker in a Field
    object. None is returned when the entry has no such field.

    @param fieldMarker: marker of desired field
    @type  fieldMarker: string
    @rtype: Field object
    """
    try:
        field = Field(fieldMarker, self._fields[fieldMarker])
    except KeyError:
        field = None
    return field
332
def set_field(self, fieldMarker, field):
    """
    This method sets a field, given a marker and its associated data.
    Any values previously stored under the marker are replaced by a
    one-element list holding I{field}.

    @param fieldMarker: field marker to set
    @type  fieldMarker: string
    @param field      : data to store as the single value of the field
    @type  field      : Field
    """
    # The original appended an undefined name (``fieldData``) and so
    # always raised NameError; the parameter is the intended value.
    self._fields[fieldMarker] = [field]
345
def set_field_values(self, fieldMarker, fieldValues):
    """
    This method replaces everything stored under a field marker with
    the given list of values (the list object itself is stored).

    @param fieldMarker: field marker to set
    @type  fieldMarker: string
    @param fieldValues: list of field values
    @type  fieldValues: list
    """
    self._fields[fieldMarker] = fieldValues
356
def add_field(self, marker, value):
    """
    This method adds a field to an entry if it does not already exist
    and adds a new value to the field of an entry if it does.

    @param marker: field marker
    @type  marker: string
    @param value : field value
    @type  value : string
    """
    # dict.has_key() no longer exists in Python 3; rely on the KeyError
    # raised by a failed lookup instead.
    try:
        fvs = self._fields[marker]
    except KeyError:
        self._fields[marker] = [value]
    else:
        fvs.append(value)
374
def remove_field(self, fieldMarker):
    """
    This method removes from an entry every field for a given
    field marker. It will not raise an error if the specified field
    does not exist.

    @param fieldMarker: field marker to be deleted
    @type  fieldMarker: string
    """
    # dict.has_key() no longer exists in Python 3; a missing marker is
    # deliberately ignored, as documented above.
    try:
        del self._fields[fieldMarker]
    except KeyError:
        pass
386
def demo():
    """
    Parse the "rotokas.dic" lexicon found under the "toolbox" directory
    of the corpus base directory, keying entries on the 'lx', 'ps' and
    'sn' fields, and print those three fields for every entry.
    """
    # NOTE(review): the ``def`` line was missing from the listing; the
    # name is taken from the ``demo()`` call below -- confirm.
    path = os.path.join(get_basedir(), "toolbox", "rotokas.dic")
    lexicon = Lexicon(path)
    # Duplicate keys are allowed here, so no ValueError is raised for them.
    lexicon.parse(key_fields=['lx', 'ps', 'sn'], unique_entry=False)
    for e in lexicon.get_entries():
        # A single parenthesised argument prints identically on
        # Python 2 and Python 3.
        print("<%s><%s><%s>" % (e.get_field_as_string("lx", ""),
                                e.get_field_as_string("ps", ""),
                                e.get_field_as_string("sn", "")))


if __name__ == '__main__':
    demo()
399