from os.path import commonprefix from itertools import groupby import re def grpm(regex): return grp(regex, matching=True) def grp(regex, matching=False): return r'(' + (r'' if matching else r'?:') + regex + r')' def opt(regex): return grp(grp(regex) + r'?') def regex_opt_r(regexes): return grp(r'|'.join(regexes)) # The below functions were taken from the pygments package # (http://pygmenst.org), in particular pygments.regexopt and pygments.lexer # Some small modifications have been made. def regex_opt_inner(strings, open_paren): """Return a regex that matches any string in the sorted list of strings.""" close_paren = open_paren and ')' or '' # print strings, repr(open_paren) if not strings: # print '-> nothing left' return '' first = strings[0] if len(strings) == 1: # print '-> only 1 string' return open_paren + re.escape(first) + close_paren if not first: # print '-> first string empty' return open_paren + regex_opt_inner(strings[1:], '(?:') \ + '?' + close_paren if len(first) == 1: # multiple one-char strings? make a charset oneletter = [] rest = [] for s in strings: if len(s) == 1: oneletter.append(s) else: rest.append(s) if len(oneletter) > 1: # do we have more than one oneletter string? if rest: # print '-> 1-character + rest' return open_paren + regex_opt_inner(rest, '') + '|' \ + make_charset(oneletter) + close_paren # print '-> only 1-character' return make_charset(oneletter) prefix = commonprefix(strings) if prefix: plen = len(prefix) # we have a prefix for all strings # print '-> prefix:', prefix return open_paren + re.escape(prefix) \ + regex_opt_inner([s[plen:] for s in strings], '(?:') \ + close_paren # is there a suffix? strings_rev = [s[::-1] for s in strings] suffix = commonprefix(strings_rev) if suffix: slen = len(suffix) # print '-> suffix:', suffix[::-1] return open_paren \ + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \ + re.escape(suffix[::-1]) + close_paren # recurse on common 1-string prefixes # print '-> last resort' return open_paren + \ '|'.join(regex_opt_inner(list(group[1]), '') for group in groupby(strings, lambda s: s[0] == first[0])) \ + close_paren def regex_opt(strings, prefix='', suffix=''): """Return a compiled regex that matches any string in the given list. The strings to match must be literal strings, not regexes. They will be regex-escaped. *prefix* and *suffix* are pre- and appended to the final regex. """ strings = sorted(strings) return prefix + regex_opt_inner(strings, '(?:') + suffix ## From here it is own work again # RFC 2396 IPv4address = grp(r'(?:\d{1,3}\.){3}\d{1,3}') reserved = grp(r'[;\/?:@&=+$,]') alphanum = grp(r'[\da-zA-Z]') mark = grp(r'[\-_\.!~\*\'\(\)]') hex = grp(r'[\da-fA-F]') unreserved = regex_opt_r([alphanum, mark]) escaped = grp(r'%' + hex + hex) pchar = regex_opt_r([unreserved, escaped, r'[:@&=+$,]']) param = grp(pchar + r'*') segment = grp(pchar + r'*' + grp(r';' + param) + r'*') path_segments = grp(segment + grp(r'\/' + segment) + r'*') abs_path = grp(r'\/' + path_segments) scheme = grp(r'[a-zA-Z](?:[a-zA-Z\d+\-\.]*)') userinfo = grp(regex_opt_r([unreserved, escaped, r'[;:&=+$,]']) + r'*') domainlabel = grp(r'[a-zA-Z\d]|(?:[a-zA-Z\d](?:[a-zA-Z\d\-])*[a-zA-Z\d])') toplabel = grp(r'[a-zA-Z]|(?:[a-zA-Z](?:[a-zA-Z\d\-])*[a-zA-Z\d])') hostname = grp(opt(domainlabel + r'\.') + r'*' + toplabel + opt(r'\.')) host = regex_opt_r([hostname, IPv4address]) port = r'\d+' hostport = grp(host + opt(r':' + port)) server = opt(opt(userinfo + r'@') + hostport) reg_name = grp(regex_opt_r([unreserved, escaped, r'[;:&=+]']) + r'*') authority = regex_opt_r([server, reg_name]) net_path = grp(r'\/\/' + authority + opt(abs_path)) hier_part = regex_opt_r([net_path, abs_path]) uric = regex_opt_r([reserved, unreserved, escaped]) uric_no_slash = regex_opt_r([unreserved, escaped, r'[;?:@&=+$,]']) opaque_part = grp(uric_no_slash + grp(uric) + r'*') absoluteURI = grp(scheme + r':' + regex_opt_r([hier_part, opaque_part])) # RFC 2616 CTL = r'[\x00-\x1f\x7f]' CR = r'\r' LF = r'\n' CRLF = CR + LF HT = r'\t' SP = r' ' LWS = grp(opt(CRLF) + regex_opt_r([SP, HT])) TEXT = grp(r'[^\x00-\x1f\x7f]|' + LWS) TEXT_NO_LWS = r'[^\x00-\x1f\x7f \t]' separator = r'[\(\)<>@,;:\\"\/\[\]?=\{\} \t]' token = r'[^\x00-\x1f\(\)<>@,;:\\"\/\[\]?=\{\} \t]+' qdtext = r'^\x00-\x08\x0b-\x0c\x0e-\x1f\x7f"]' quotedPair = r'\\[\x00-\x7f]' quotedString = grp(r'"' + regex_opt_r([qdtext, quotedPair]) + r'*"') qvalue = regex_opt_r([r'0(?:\.\d{0,3})?', r'1(?:\.0{0,3})?']) HTTPVersion = r'HTTP\/\d\.\d' Method = regex_opt(['OPTIONS', 'GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE', 'CONNECT']) RequestURI = regex_opt_r([r'\*', absoluteURI, abs_path, authority]) RequestLine = grp(grpm(Method) + SP + grpm(RequestURI) + SP + grpm(HTTPVersion) + CRLF) StatusCode = r'\d{3}' ReasonPhrase = r'[^\r\n]*' StatusLine = grp(grpm(HTTPVersion) + SP + grpm(StatusCode) + SP + grpm(ReasonPhrase) + CRLF) FieldName = token FieldContent = regex_opt_r([TEXT_NO_LWS + TEXT + r'*(?!' + LWS + r')']) FieldValue = grp(regex_opt_r([grp(FieldContent), LWS]) + r'*') MessageHeader = grp(grpm(FieldName) + r':' + grpm(FieldValue)) ETagSplit = grp(r',' + LWS + r'*') EncodingSplit = ETagSplit contentCoding = token coding = regex_opt_r([contentCoding, r'\*']) AcceptEncodingValue = grp(grpm(coding) + grp(r';q=' + grpm(qvalue)) + r'?')