summaryrefslogtreecommitdiff
path: root/project1/proj1_s4498062/webhttp/regexes.py
diff options
context:
space:
mode:
Diffstat (limited to 'project1/proj1_s4498062/webhttp/regexes.py')
-rw-r--r--project1/proj1_s4498062/webhttp/regexes.py151
1 files changed, 151 insertions, 0 deletions
diff --git a/project1/proj1_s4498062/webhttp/regexes.py b/project1/proj1_s4498062/webhttp/regexes.py
new file mode 100644
index 0000000..755bb9f
--- /dev/null
+++ b/project1/proj1_s4498062/webhttp/regexes.py
@@ -0,0 +1,151 @@
+from os.path import commonprefix
+from itertools import groupby
+import re
+
+def grpm(regex):
+ return grp(regex, matching=True)
+
+def grp(regex, matching=False):
+ return r'(' + (r'' if matching else r'?:') + regex + r')'
+
+def opt(regex):
+ return grp(grp(regex) + r'?')
+
+def regex_opt_r(regexes):
+ return grp(r'|'.join(regexes))
+
+# The below functions were taken from the pygments package
+# (http://pygmenst.org), in particular pygments.regexopt and pygments.lexer
+# Some small modifications have been made.
+def regex_opt_inner(strings, open_paren):
+ """Return a regex that matches any string in the sorted list of strings."""
+ close_paren = open_paren and ')' or ''
+ # print strings, repr(open_paren)
+ if not strings:
+ # print '-> nothing left'
+ return ''
+ first = strings[0]
+ if len(strings) == 1:
+ # print '-> only 1 string'
+ return open_paren + re.escape(first) + close_paren
+ if not first:
+ # print '-> first string empty'
+ return open_paren + regex_opt_inner(strings[1:], '(?:') \
+ + '?' + close_paren
+ if len(first) == 1:
+ # multiple one-char strings? make a charset
+ oneletter = []
+ rest = []
+ for s in strings:
+ if len(s) == 1:
+ oneletter.append(s)
+ else:
+ rest.append(s)
+ if len(oneletter) > 1: # do we have more than one oneletter string?
+ if rest:
+ # print '-> 1-character + rest'
+ return open_paren + regex_opt_inner(rest, '') + '|' \
+ + make_charset(oneletter) + close_paren
+ # print '-> only 1-character'
+ return make_charset(oneletter)
+ prefix = commonprefix(strings)
+ if prefix:
+ plen = len(prefix)
+ # we have a prefix for all strings
+ # print '-> prefix:', prefix
+ return open_paren + re.escape(prefix) \
+ + regex_opt_inner([s[plen:] for s in strings], '(?:') \
+ + close_paren
+ # is there a suffix?
+ strings_rev = [s[::-1] for s in strings]
+ suffix = commonprefix(strings_rev)
+ if suffix:
+ slen = len(suffix)
+ # print '-> suffix:', suffix[::-1]
+ return open_paren \
+ + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \
+ + re.escape(suffix[::-1]) + close_paren
+ # recurse on common 1-string prefixes
+ # print '-> last resort'
+ return open_paren + \
+ '|'.join(regex_opt_inner(list(group[1]), '')
+ for group in groupby(strings, lambda s: s[0] == first[0])) \
+ + close_paren
+
+def regex_opt(strings, prefix='', suffix=''):
+ """Return a compiled regex that matches any string in the given list.
+
+ The strings to match must be literal strings, not regexes. They will be
+ regex-escaped.
+
+ *prefix* and *suffix* are pre- and appended to the final regex.
+ """
+ strings = sorted(strings)
+ return prefix + regex_opt_inner(strings, '(?:') + suffix
+
+## From here it is own work again
+# RFC 2396
+IPv4address = grp(r'(?:\d{1,3}\.){3}\d{1,3}')
+reserved = grp(r'[;\/?:@&=+$,]')
+alphanum = grp(r'[\da-zA-Z]')
+mark = grp(r'[\-_\.!~\*\'\(\)]')
+hex = grp(r'[\da-fA-F]')
+unreserved = regex_opt_r([alphanum, mark])
+escaped = grp(r'%' + hex + hex)
+pchar = regex_opt_r([unreserved, escaped, r'[:@&=+$,]'])
+param = grp(pchar + r'*')
+segment = grp(pchar + r'*' + grp(r';' + param) + r'*')
+path_segments = grp(segment + grp(r'\/' + segment) + r'*')
+abs_path = grp(r'\/' + path_segments)
+scheme = grp(r'[a-zA-Z](?:[a-zA-Z\d+\-\.]*)')
+userinfo = grp(regex_opt_r([unreserved, escaped, r'[;:&=+$,]']) + r'*')
+domainlabel = grp(r'[a-zA-Z\d]|(?:[a-zA-Z\d](?:[a-zA-Z\d\-])*[a-zA-Z\d])')
+toplabel = grp(r'[a-zA-Z]|(?:[a-zA-Z](?:[a-zA-Z\d\-])*[a-zA-Z\d])')
+hostname = grp(opt(domainlabel + r'\.') + r'*' + toplabel + opt(r'\.'))
+host = regex_opt_r([hostname, IPv4address])
+port = r'\d+'
+hostport = grp(host + opt(r':' + port))
+server = opt(opt(userinfo + r'@') + hostport)
+reg_name = grp(regex_opt_r([unreserved, escaped, r'[;:&=+]']) + r'*')
+authority = regex_opt_r([server, reg_name])
+net_path = grp(r'\/\/' + authority + opt(abs_path))
+hier_part = regex_opt_r([net_path, abs_path])
+uric = regex_opt_r([reserved, unreserved, escaped])
+uric_no_slash = regex_opt_r([unreserved, escaped, r'[;?:@&=+$,]'])
+opaque_part = grp(uric_no_slash + grp(uric) + r'*')
+absoluteURI = grp(scheme + r':' + regex_opt_r([hier_part, opaque_part]))
+
+# RFC 2616
+CTL = r'[\x00-\x1f\x7f]'
+CR = r'\r'
+LF = r'\n'
+CRLF = CR + LF
+HT = r'\t'
+SP = r' '
+LWS = grp(opt(CRLF) + regex_opt_r([SP, HT]))
+TEXT = grp(r'[^\x00-\x1f\x7f]|' + LWS)
+TEXT_NO_LWS = r'[^\x00-\x1f\x7f \t]'
+
+separator = r'[\(\)<>@,;:\\"\/\[\]?=\{\} \t]'
+token = r'[^\x00-\x1f\(\)<>@,;:\\"\/\[\]?=\{\} \t]+'
+qdtext = r'^\x00-\x08\x0b-\x0c\x0e-\x1f\x7f"]'
+quotedPair = r'\\[\x00-\x7f]'
+quotedString = grp(r'"' + regex_opt_r([qdtext, quotedPair]) + r'*"')
+
+HTTPVersion = r'HTTP\/\d\.\d'
+Method = regex_opt(['OPTIONS', 'GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE',
+ 'CONNECT'])
+RequestURI = regex_opt_r([r'\*', absoluteURI, abs_path, authority])
+RequestLine = grp(grpm(Method) + SP + grpm(RequestURI) + SP +
+ grpm(HTTPVersion) + CRLF)
+
+StatusCode = r'\d{3}'
+ReasonPhrase = r'[^\r\n]*'
+StatusLine = grp(grpm(HTTPVersion) + SP + grpm(StatusCode) + SP +
+ grpm(ReasonPhrase) + CRLF)
+
+FieldName = token
+FieldContent = regex_opt_r([TEXT_NO_LWS + TEXT + r'*(?!' + LWS + r')'])
+FieldValue = grp(regex_opt_r([grp(FieldContent), LWS]) + r'*')
+MessageHeader = grp(grpm(FieldName) + r':' + grpm(FieldValue))
+