Module sre_compile
[hide private]
[frames] | no frames]

Source Code for Module sre_compile

  1  # 
  2  # Secret Labs' Regular Expression Engine 
  3  # 
  4  # convert template to internal format 
  5  # 
  6  # Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved. 
  7  # 
  8  # See the sre.py file for information on usage and redistribution. 
  9  # 
 10   
 11  """Internal support module for sre""" 
 12   
 13  import _sre, sys 
 14   
 15  from sre_constants import * 
 16   
 17  assert _sre.MAGIC == MAGIC, "SRE module mismatch" 
 18   
 19  if _sre.CODESIZE == 2: 
 20      MAXCODE = 65535 
 21  else: 
 22      MAXCODE = 0xFFFFFFFFL 
 23   
24 -def _identityfunction(x):
25 return x
26
27 -def set(seq):
28 s = {} 29 for elem in seq: 30 s[elem] = 1 31 return s
32 33 _LITERAL_CODES = set([LITERAL, NOT_LITERAL]) 34 _REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT]) 35 _SUCCESS_CODES = set([SUCCESS, FAILURE]) 36 _ASSERT_CODES = set([ASSERT, ASSERT_NOT]) 37
38 -def _compile(code, pattern, flags):
39 # internal: compile a (sub)pattern 40 emit = code.append 41 _len = len 42 LITERAL_CODES = _LITERAL_CODES 43 REPEATING_CODES = _REPEATING_CODES 44 SUCCESS_CODES = _SUCCESS_CODES 45 ASSERT_CODES = _ASSERT_CODES 46 for op, av in pattern: 47 if op in LITERAL_CODES: 48 if flags & SRE_FLAG_IGNORECASE: 49 emit(OPCODES[OP_IGNORE[op]]) 50 emit(_sre.getlower(av, flags)) 51 else: 52 emit(OPCODES[op]) 53 emit(av) 54 elif op is IN: 55 if flags & SRE_FLAG_IGNORECASE: 56 emit(OPCODES[OP_IGNORE[op]]) 57 def fixup(literal, flags=flags): 58 return _sre.getlower(literal, flags)
59 else: 60 emit(OPCODES[op]) 61 fixup = _identityfunction 62 skip = _len(code); emit(0) 63 _compile_charset(av, flags, code, fixup) 64 code[skip] = _len(code) - skip 65 elif op is ANY: 66 if flags & SRE_FLAG_DOTALL: 67 emit(OPCODES[ANY_ALL]) 68 else: 69 emit(OPCODES[ANY]) 70 elif op in REPEATING_CODES: 71 if flags & SRE_FLAG_TEMPLATE: 72 raise error, "internal: unsupported template operator" 73 emit(OPCODES[REPEAT]) 74 skip = _len(code); emit(0) 75 emit(av[0]) 76 emit(av[1]) 77 _compile(code, av[2], flags) 78 emit(OPCODES[SUCCESS]) 79 code[skip] = _len(code) - skip 80 elif _simple(av) and op is not REPEAT: 81 if op is MAX_REPEAT: 82 emit(OPCODES[REPEAT_ONE]) 83 else: 84 emit(OPCODES[MIN_REPEAT_ONE]) 85 skip = _len(code); emit(0) 86 emit(av[0]) 87 emit(av[1]) 88 _compile(code, av[2], flags) 89 emit(OPCODES[SUCCESS]) 90 code[skip] = _len(code) - skip 91 else: 92 emit(OPCODES[REPEAT]) 93 skip = _len(code); emit(0) 94 emit(av[0]) 95 emit(av[1]) 96 _compile(code, av[2], flags) 97 code[skip] = _len(code) - skip 98 if op is MAX_REPEAT: 99 emit(OPCODES[MAX_UNTIL]) 100 else: 101 emit(OPCODES[MIN_UNTIL]) 102 elif op is SUBPATTERN: 103 if av[0]: 104 emit(OPCODES[MARK]) 105 emit((av[0]-1)*2) 106 # _compile_info(code, av[1], flags) 107 _compile(code, av[1], flags) 108 if av[0]: 109 emit(OPCODES[MARK]) 110 emit((av[0]-1)*2+1) 111 elif op in SUCCESS_CODES: 112 emit(OPCODES[op]) 113 elif op in ASSERT_CODES: 114 emit(OPCODES[op]) 115 skip = _len(code); emit(0) 116 if av[0] >= 0: 117 emit(0) # look ahead 118 else: 119 lo, hi = av[1].getwidth() 120 if lo != hi: 121 raise error, "look-behind requires fixed-width pattern" 122 emit(lo) # look behind 123 _compile(code, av[1], flags) 124 emit(OPCODES[SUCCESS]) 125 code[skip] = _len(code) - skip 126 elif op is CALL: 127 emit(OPCODES[op]) 128 skip = _len(code); emit(0) 129 _compile(code, av, flags) 130 emit(OPCODES[SUCCESS]) 131 code[skip] = _len(code) - skip 132 elif op is AT: 133 emit(OPCODES[op]) 134 if flags & SRE_FLAG_MULTILINE: 135 av = AT_MULTILINE.get(av, av) 136 if flags & SRE_FLAG_LOCALE: 137 av = AT_LOCALE.get(av, av) 138 elif flags & SRE_FLAG_UNICODE: 139 av = AT_UNICODE.get(av, av) 140 emit(ATCODES[av]) 141 elif op is BRANCH: 142 emit(OPCODES[op]) 143 tail = [] 144 tailappend = tail.append 145 for av in av[1]: 146 skip = _len(code); emit(0) 147 # _compile_info(code, av, flags) 148 _compile(code, av, flags) 149 emit(OPCODES[JUMP]) 150 tailappend(_len(code)); emit(0) 151 code[skip] = _len(code) - skip 152 emit(0) # end of branch 153 for tail in tail: 154 code[tail] = _len(code) - tail 155 elif op is CATEGORY: 156 emit(OPCODES[op]) 157 if flags & SRE_FLAG_LOCALE: 158 av = CH_LOCALE[av] 159 elif flags & SRE_FLAG_UNICODE: 160 av = CH_UNICODE[av] 161 emit(CHCODES[av]) 162 elif op is GROUPREF: 163 if flags & SRE_FLAG_IGNORECASE: 164 emit(OPCODES[OP_IGNORE[op]]) 165 else: 166 emit(OPCODES[op]) 167 emit(av-1) 168 elif op is GROUPREF_EXISTS: 169 emit(OPCODES[op]) 170 emit(av[0]-1) 171 skipyes = _len(code); emit(0) 172 _compile(code, av[1], flags) 173 if av[2]: 174 emit(OPCODES[JUMP]) 175 skipno = _len(code); emit(0) 176 code[skipyes] = _len(code) - skipyes + 1 177 _compile(code, av[2], flags) 178 code[skipno] = _len(code) - skipno 179 else: 180 code[skipyes] = _len(code) - skipyes + 1 181 else: 182 raise ValueError, ("unsupported operand type", op) 183
184 -def _compile_charset(charset, flags, code, fixup=None):
185 # compile charset subprogram 186 emit = code.append 187 if fixup is None: 188 fixup = _identityfunction 189 for op, av in _optimize_charset(charset, fixup): 190 emit(OPCODES[op]) 191 if op is NEGATE: 192 pass 193 elif op is LITERAL: 194 emit(fixup(av)) 195 elif op is RANGE: 196 emit(fixup(av[0])) 197 emit(fixup(av[1])) 198 elif op is CHARSET: 199 code.extend(av) 200 elif op is BIGCHARSET: 201 code.extend(av) 202 elif op is CATEGORY: 203 if flags & SRE_FLAG_LOCALE: 204 emit(CHCODES[CH_LOCALE[av]]) 205 elif flags & SRE_FLAG_UNICODE: 206 emit(CHCODES[CH_UNICODE[av]]) 207 else: 208 emit(CHCODES[av]) 209 else: 210 raise error, "internal: unsupported set operator" 211 emit(OPCODES[FAILURE])
212
213 -def _optimize_charset(charset, fixup):
214 # internal: optimize character set 215 out = [] 216 outappend = out.append 217 charmap = [0]*256 218 try: 219 for op, av in charset: 220 if op is NEGATE: 221 outappend((op, av)) 222 elif op is LITERAL: 223 charmap[fixup(av)] = 1 224 elif op is RANGE: 225 for i in range(fixup(av[0]), fixup(av[1])+1): 226 charmap[i] = 1 227 elif op is CATEGORY: 228 # XXX: could append to charmap tail 229 return charset # cannot compress 230 except IndexError: 231 # character set contains unicode characters 232 return _optimize_unicode(charset, fixup) 233 # compress character map 234 i = p = n = 0 235 runs = [] 236 runsappend = runs.append 237 for c in charmap: 238 if c: 239 if n == 0: 240 p = i 241 n = n + 1 242 elif n: 243 runsappend((p, n)) 244 n = 0 245 i = i + 1 246 if n: 247 runsappend((p, n)) 248 if len(runs) <= 2: 249 # use literal/range 250 for p, n in runs: 251 if n == 1: 252 outappend((LITERAL, p)) 253 else: 254 outappend((RANGE, (p, p+n-1))) 255 if len(out) < len(charset): 256 return out 257 else: 258 # use bitmap 259 data = _mk_bitmap(charmap) 260 outappend((CHARSET, data)) 261 return out 262 return charset
263
264 -def _mk_bitmap(bits):
265 data = [] 266 dataappend = data.append 267 if _sre.CODESIZE == 2: 268 start = (1, 0) 269 else: 270 start = (1L, 0L) 271 m, v = start 272 for c in bits: 273 if c: 274 v = v + m 275 m = m + m 276 if m > MAXCODE: 277 dataappend(v) 278 m, v = start 279 return data
280 281 # To represent a big charset, first a bitmap of all characters in the 282 # set is constructed. Then, this bitmap is sliced into chunks of 256 283 # characters, duplicate chunks are eliminitated, and each chunk is 284 # given a number. In the compiled expression, the charset is 285 # represented by a 16-bit word sequence, consisting of one word for 286 # the number of different chunks, a sequence of 256 bytes (128 words) 287 # of chunk numbers indexed by their original chunk position, and a 288 # sequence of chunks (16 words each). 289 290 # Compression is normally good: in a typical charset, large ranges of 291 # Unicode will be either completely excluded (e.g. if only cyrillic 292 # letters are to be matched), or completely included (e.g. if large 293 # subranges of Kanji match). These ranges will be represented by 294 # chunks of all one-bits or all zero-bits. 295 296 # Matching can be also done efficiently: the more significant byte of 297 # the Unicode character is an index into the chunk number, and the 298 # less significant byte is a bit index in the chunk (just like the 299 # CHARSET matching). 300 301 # In UCS-4 mode, the BIGCHARSET opcode still supports only subsets 302 # of the basic multilingual plane; an efficient representation 303 # for all of UTF-16 has not yet been developed. This means, 304 # in particular, that negated charsets cannot be represented as 305 # bigcharsets. 306
307 -def _optimize_unicode(charset, fixup):
308 try: 309 import array 310 except ImportError: 311 return charset 312 charmap = [0]*65536 313 negate = 0 314 try: 315 for op, av in charset: 316 if op is NEGATE: 317 negate = 1 318 elif op is LITERAL: 319 charmap[fixup(av)] = 1 320 elif op is RANGE: 321 for i in xrange(fixup(av[0]), fixup(av[1])+1): 322 charmap[i] = 1 323 elif op is CATEGORY: 324 # XXX: could expand category 325 return charset # cannot compress 326 except IndexError: 327 # non-BMP characters 328 return charset 329 if negate: 330 if sys.maxunicode != 65535: 331 # XXX: negation does not work with big charsets 332 return charset 333 for i in xrange(65536): 334 charmap[i] = not charmap[i] 335 comps = {} 336 mapping = [0]*256 337 block = 0 338 data = [] 339 for i in xrange(256): 340 chunk = tuple(charmap[i*256:(i+1)*256]) 341 new = comps.setdefault(chunk, block) 342 mapping[i] = new 343 if new == block: 344 block = block + 1 345 data = data + _mk_bitmap(chunk) 346 header = [block] 347 if _sre.CODESIZE == 2: 348 code = 'H' 349 else: 350 code = 'I' 351 # Convert block indices to byte array of 256 bytes 352 mapping = array.array('b', mapping).tostring() 353 # Convert byte array to word array 354 mapping = array.array(code, mapping) 355 assert mapping.itemsize == _sre.CODESIZE 356 header = header + mapping.tolist() 357 data[0:0] = header 358 return [(BIGCHARSET, data)]
359
360 -def _simple(av):
361 # check if av is a "simple" operator 362 lo, hi = av[2].getwidth() 363 if lo == 0 and hi == MAXREPEAT: 364 raise error, "nothing to repeat" 365 return lo == hi == 1 and av[2][0][0] != SUBPATTERN
366
367 -def _compile_info(code, pattern, flags):
368 # internal: compile an info block. in the current version, 369 # this contains min/max pattern width, and an optional literal 370 # prefix or a character map 371 lo, hi = pattern.getwidth() 372 if lo == 0: 373 return # not worth it 374 # look for a literal prefix 375 prefix = [] 376 prefixappend = prefix.append 377 prefix_skip = 0 378 charset = [] # not used 379 charsetappend = charset.append 380 if not (flags & SRE_FLAG_IGNORECASE): 381 # look for literal prefix 382 for op, av in pattern.data: 383 if op is LITERAL: 384 if len(prefix) == prefix_skip: 385 prefix_skip = prefix_skip + 1 386 prefixappend(av) 387 elif op is SUBPATTERN and len(av[1]) == 1: 388 op, av = av[1][0] 389 if op is LITERAL: 390 prefixappend(av) 391 else: 392 break 393 else: 394 break 395 # if no prefix, look for charset prefix 396 if not prefix and pattern.data: 397 op, av = pattern.data[0] 398 if op is SUBPATTERN and av[1]: 399 op, av = av[1][0] 400 if op is LITERAL: 401 charsetappend((op, av)) 402 elif op is BRANCH: 403 c = [] 404 cappend = c.append 405 for p in av[1]: 406 if not p: 407 break 408 op, av = p[0] 409 if op is LITERAL: 410 cappend((op, av)) 411 else: 412 break 413 else: 414 charset = c 415 elif op is BRANCH: 416 c = [] 417 cappend = c.append 418 for p in av[1]: 419 if not p: 420 break 421 op, av = p[0] 422 if op is LITERAL: 423 cappend((op, av)) 424 else: 425 break 426 else: 427 charset = c 428 elif op is IN: 429 charset = av 430 ## if prefix: 431 ## print "*** PREFIX", prefix, prefix_skip 432 ## if charset: 433 ## print "*** CHARSET", charset 434 # add an info block 435 emit = code.append 436 emit(OPCODES[INFO]) 437 skip = len(code); emit(0) 438 # literal flag 439 mask = 0 440 if prefix: 441 mask = SRE_INFO_PREFIX 442 if len(prefix) == prefix_skip == len(pattern.data): 443 mask = mask + SRE_INFO_LITERAL 444 elif charset: 445 mask = mask + SRE_INFO_CHARSET 446 emit(mask) 447 # pattern length 448 if lo < MAXCODE: 449 emit(lo) 450 else: 451 emit(MAXCODE) 452 prefix = prefix[:MAXCODE] 453 if hi < MAXCODE: 454 emit(hi) 455 else: 456 emit(0) 457 # add literal prefix 458 if prefix: 459 emit(len(prefix)) # length 460 emit(prefix_skip) # skip 461 code.extend(prefix) 462 # generate overlap table 463 table = [-1] + ([0]*len(prefix)) 464 for i in xrange(len(prefix)): 465 table[i+1] = table[i]+1 466 while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]: 467 table[i+1] = table[table[i+1]-1]+1 468 code.extend(table[1:]) # don't store first entry 469 elif charset: 470 _compile_charset(charset, flags, code) 471 code[skip] = len(code) - skip
472 473 try: 474 unicode 475 except NameError: 476 STRING_TYPES = (type(""),) 477 else: 478 STRING_TYPES = (type(""), type(unicode(""))) 479
480 -def isstring(obj):
481 for tp in STRING_TYPES: 482 if isinstance(obj, tp): 483 return 1 484 return 0
485
486 -def _code(p, flags):
487 488 flags = p.pattern.flags | flags 489 code = [] 490 491 # compile info block 492 _compile_info(code, p, flags) 493 494 # compile the pattern 495 _compile(code, p.data, flags) 496 497 code.append(OPCODES[SUCCESS]) 498 499 return code
500
501 -def compile(p, flags=0):
502 # internal: convert pattern list to internal format 503 504 if isstring(p): 505 import sre_parse 506 pattern = p 507 p = sre_parse.parse(p, flags) 508 else: 509 pattern = None 510 511 code = _code(p, flags) 512 513 # print code 514 515 # XXX: <fl> get rid of this limitation! 516 if p.pattern.groups > 100: 517 raise AssertionError( 518 "sorry, but this version only supports 100 named groups" 519 ) 520 521 # map in either direction 522 groupindex = p.pattern.groupdict 523 indexgroup = [None] * p.pattern.groups 524 for k, i in groupindex.items(): 525 indexgroup[i] = k 526 527 return _sre.compile( 528 pattern, flags, code, 529 p.pattern.groups-1, 530 groupindex, indexgroup 531 )
532