diff options
Diffstat (limited to 'project1/proj1_s4498062/webhttp/regexes.py')
-rw-r--r-- | project1/proj1_s4498062/webhttp/regexes.py | 151 |
1 files changed, 151 insertions, 0 deletions
"""Regular-expression building blocks for URIs (RFC 2396) and HTTP (RFC 2616).

Every module-level constant is a pattern *string* (not a compiled regex), so
that larger grammar productions can be assembled by plain concatenation.
Only ``grpm`` produces capturing groups; everything else is non-capturing, so
the group numbering of ``RequestLine``, ``StatusLine`` and ``MessageHeader``
is determined solely by their explicit ``grpm`` calls.
"""
from os.path import commonprefix
from itertools import groupby
import re


def grpm(regex):
    """Wrap *regex* in a capturing ("matching") group."""
    return grp(regex, matching=True)


def grp(regex, matching=False):
    """Wrap *regex* in a group; non-capturing unless *matching* is true."""
    return r'(' + (r'' if matching else r'?:') + regex + r')'


def opt(regex):
    """Return a non-capturing group that matches *regex* zero or one time."""
    return grp(grp(regex) + r'?')


def regex_opt_r(regexes):
    """Return a non-capturing alternation of the given regex strings.

    Unlike ``regex_opt`` the alternatives are regexes and are NOT escaped.
    """
    return grp(r'|'.join(regexes))


# The functions below were taken from the pygments package
# (http://pygments.org), in particular pygments.regexopt and pygments.lexer.
# Some small modifications have been made.

# Characters that carry a special meaning inside a character class.
_CS_ESCAPE = re.compile(r'[\[\^\\\-\]]')


def make_charset(letters):
    """Return a character class matching any one of the given characters."""
    return '[' + _CS_ESCAPE.sub(lambda m: '\\' + m.group(),
                                ''.join(letters)) + ']'


def regex_opt_inner(strings, open_paren):
    """Return a regex that matches any string in the sorted list of strings.

    *strings* must already be sorted (``regex_opt`` takes care of that).
    *open_paren* is either ``''`` or a group opener such as ``'(?:'``; when
    it is non-empty the result is closed with a matching ``')'``.
    """
    close_paren = ')' if open_paren else ''
    if not strings:
        # Nothing left to match.
        return ''
    first = strings[0]
    if len(strings) == 1:
        # A single literal.
        return open_paren + re.escape(first) + close_paren
    if not first:
        # The empty string sorts first: everything else becomes optional.
        return open_paren + regex_opt_inner(strings[1:], '(?:') \
            + '?' + close_paren
    if len(first) == 1:
        # Multiple one-char strings? Make a charset.
        oneletter = [s for s in strings if len(s) == 1]
        rest = [s for s in strings if len(s) != 1]
        if len(oneletter) > 1:
            if rest:
                # One-character strings plus longer ones.
                return open_paren + regex_opt_inner(rest, '') + '|' \
                    + make_charset(oneletter) + close_paren
            # Only one-character strings.
            return make_charset(oneletter)
    prefix = commonprefix(strings)
    if prefix:
        # All strings share a prefix: factor it out.
        plen = len(prefix)
        return open_paren + re.escape(prefix) \
            + regex_opt_inner([s[plen:] for s in strings], '(?:') \
            + close_paren
    # Is there a common suffix? (A common prefix of the reversed strings.)
    strings_rev = [s[::-1] for s in strings]
    suffix = commonprefix(strings_rev)
    if suffix:
        slen = len(suffix)
        return open_paren \
            + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \
            + re.escape(suffix[::-1]) + close_paren
    # Last resort: split off the strings that share the first character of
    # ``first`` and recurse on both parts.
    return open_paren + \
        '|'.join(regex_opt_inner(list(group[1]), '')
                 for group in groupby(strings, lambda s: s[0] == first[0])) \
        + close_paren


def regex_opt(strings, prefix='', suffix=''):
    """Return a regex string that matches any string in the given list.

    The strings to match must be literal strings, not regexes. They will be
    regex-escaped.

    *prefix* and *suffix* are pre- and appended to the final regex.
    """
    strings = sorted(strings)
    return prefix + regex_opt_inner(strings, '(?:') + suffix


## From here it is own work again
# RFC 2396
IPv4address = grp(r'(?:\d{1,3}\.){3}\d{1,3}')
reserved = grp(r'[;\/?:@&=+$,]')
alphanum = grp(r'[\da-zA-Z]')
mark = grp(r'[\-_\.!~\*\'\(\)]')
# NOTE: shadows the builtin hex(); the name is kept because it is part of
# this module's public namespace.
hex = grp(r'[\da-fA-F]')
unreserved = regex_opt_r([alphanum, mark])
escaped = grp(r'%' + hex + hex)
pchar = regex_opt_r([unreserved, escaped, r'[:@&=+$,]'])
param = grp(pchar + r'*')
segment = grp(pchar + r'*' + grp(r';' + param) + r'*')
path_segments = grp(segment + grp(r'\/' + segment) + r'*')
abs_path = grp(r'\/' + path_segments)
scheme = grp(r'[a-zA-Z](?:[a-zA-Z\d+\-\.]*)')
userinfo = grp(regex_opt_r([unreserved, escaped, r'[;:&=+$,]']) + r'*')
domainlabel = grp(r'[a-zA-Z\d]|(?:[a-zA-Z\d](?:[a-zA-Z\d\-])*[a-zA-Z\d])')
toplabel = grp(r'[a-zA-Z]|(?:[a-zA-Z](?:[a-zA-Z\d\-])*[a-zA-Z\d])')
hostname = grp(opt(domainlabel + r'\.') + r'*' + toplabel + opt(r'\.'))
host = regex_opt_r([hostname, IPv4address])
port = r'\d+'
hostport = grp(host + opt(r':' + port))
server = opt(opt(userinfo + r'@') + hostport)
# reg_name = 1*( unreserved | escaped | "$" | "," | ";" | ":" | "@" | "&" |
#                "=" | "+" )
reg_name = grp(regex_opt_r([unreserved, escaped, r'[$,;:@&=+]']) + r'+')
authority = regex_opt_r([server, reg_name])
net_path = grp(r'\/\/' + authority + opt(abs_path))
hier_part = regex_opt_r([net_path, abs_path])
uric = regex_opt_r([reserved, unreserved, escaped])
uric_no_slash = regex_opt_r([unreserved, escaped, r'[;?:@&=+$,]'])
opaque_part = grp(uric_no_slash + grp(uric) + r'*')
absoluteURI = grp(scheme + r':' + regex_opt_r([hier_part, opaque_part]))

# RFC 2616
CTL = r'[\x00-\x1f\x7f]'
CR = r'\r'
LF = r'\n'
CRLF = CR + LF
HT = r'\t'
SP = r' '
# A single SP/HT, optionally preceded by CRLF; runs of whitespace are
# handled by the repetition in the patterns that embed LWS.
LWS = grp(opt(CRLF) + regex_opt_r([SP, HT]))
TEXT = grp(r'[^\x00-\x1f\x7f]|' + LWS)
TEXT_NO_LWS = r'[^\x00-\x1f\x7f \t]'

separator = r'[\(\)<>@,;:\\"\/\[\]?=\{\} \t]'
# One or more characters that are neither CTLs (including DEL) nor separators.
token = r'[^\x00-\x1f\x7f\(\)<>@,;:\\"\/\[\]?=\{\} \t]+'
# Any text character except the double quote (TAB, CR and LF stay allowed).
qdtext = r'[^\x00-\x08\x0b-\x0c\x0e-\x1f\x7f"]'
quotedPair = r'\\[\x00-\x7f]'
quotedString = grp(r'"' + regex_opt_r([qdtext, quotedPair]) + r'*"')

HTTPVersion = r'HTTP\/\d\.\d'
Method = regex_opt(['OPTIONS', 'GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE',
                    'CONNECT'])
RequestURI = regex_opt_r([r'\*', absoluteURI, abs_path, authority])
# Groups: 1 = method, 2 = request URI, 3 = HTTP version.
RequestLine = grp(grpm(Method) + SP + grpm(RequestURI) + SP +
                  grpm(HTTPVersion) + CRLF)

StatusCode = r'\d{3}'
ReasonPhrase = r'[^\r\n]*'
# Groups: 1 = HTTP version, 2 = status code, 3 = reason phrase.
StatusLine = grp(grpm(HTTPVersion) + SP + grpm(StatusCode) + SP +
                 grpm(ReasonPhrase) + CRLF)

FieldName = token
FieldContent = regex_opt_r([TEXT_NO_LWS + TEXT + r'*(?!' + LWS + r')'])
FieldValue = grp(regex_opt_r([grp(FieldContent), LWS]) + r'*')
# Groups: 1 = field name, 2 = field value (leading whitespace included).
MessageHeader = grp(grpm(FieldName) + r':' + grpm(FieldValue))