# project1/proj1_s4498062/webhttp/regexes.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158

from os.path import commonprefix
from itertools import groupby
import re

def grpm(regex):
    """Return *regex* wrapped in a capturing ("matching") group."""
    capturing = True
    return grp(regex, capturing)

def grp(regex, matching=False):
    """Return *regex* wrapped in a group.

    The group is capturing when *matching* is true and non-capturing
    (``(?:...)``) otherwise.
    """
    if matching:
        opener = r'('
    else:
        opener = r'(?:'
    return opener + regex + r')'

def opt(regex):
    """Return a non-capturing group that matches *regex* zero or one time."""
    optional = grp(regex) + r'?'
    return grp(optional)

def regex_opt_r(regexes):
    """Return a non-capturing group matching any one of *regexes*.

    Unlike ``regex_opt`` the alternatives are regex fragments and are joined
    with ``|`` as they are, without escaping.
    """
    alternation = r'|'.join(regexes)
    return grp(alternation)

# The functions below were taken from the Pygments package
# (http://pygments.org), in particular pygments.regexopt and pygments.lexer.
# Some small modifications have been made.
# Characters that are special inside a regex character class.
_CHARSET_ESCAPE = re.compile(r'[\^\\\-\]]')


def _make_charset(letters):
    """Return a regex character class matching any one string in *letters*.

    *letters* is an iterable of one-character strings.  A caret, backslash,
    hyphen or closing bracket is backslash-escaped so that it is taken
    literally inside the class.
    """
    escaped_letters = _CHARSET_ESCAPE.sub(lambda m: '\\' + m.group(),
                                          ''.join(letters))
    return '[' + escaped_letters + ']'


def regex_opt_inner(strings, open_paren):
    """Return a regex pattern string matching any string in *strings*.

    *strings* must be a sorted list of literal strings; they are escaped
    with ``re.escape``.  *open_paren* is the group opener the result is
    wrapped in (for example ``'(?:'``), or ``''`` for no wrapping group.
    An empty list yields the empty string.
    """
    close_paren = ')' if open_paren else ''
    if not strings:
        return ''
    first = strings[0]
    if len(strings) == 1:
        return open_paren + re.escape(first) + close_paren
    if not first:
        # The empty string is one of the alternatives, so whatever matches
        # the remaining strings becomes optional.
        return (open_paren + regex_opt_inner(strings[1:], '(?:') + '?'
                + close_paren)
    if len(first) == 1:
        # Several one-character strings collapse into one character class.
        oneletter = [s for s in strings if len(s) == 1]
        rest = [s for s in strings if len(s) != 1]
        if len(oneletter) > 1:
            if rest:
                return (open_paren + regex_opt_inner(rest, '') + '|'
                        + _make_charset(oneletter) + close_paren)
            # Only one-character strings: the bare class is returned
            # without a surrounding group.
            return _make_charset(oneletter)
    prefix = commonprefix(strings)
    if prefix:
        # Factor out the prefix shared by all strings.
        plen = len(prefix)
        return (open_paren + re.escape(prefix)
                + regex_opt_inner([s[plen:] for s in strings], '(?:')
                + close_paren)
    # No shared prefix; look for a shared suffix via the reversed strings.
    suffix = commonprefix([s[::-1] for s in strings])
    if suffix:
        slen = len(suffix)
        # Stripping the suffix can change the order, so sort again.
        return (open_paren
                + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:')
                + re.escape(suffix[::-1]) + close_paren)
    # Last resort: split into consecutive runs according to whether a string
    # starts with the same character as the first one, and recurse on each.
    return (open_paren
            + '|'.join(regex_opt_inner(list(group), '')
                       for _, group in groupby(strings,
                                               lambda s: s[0] == first[0]))
            + close_paren)

def regex_opt(strings, prefix='', suffix=''):
    """Return a regex pattern string that matches any string in *strings*.

    The entries of *strings* are literal strings, not regexes; they are
    regex-escaped.  The result is a pattern string (it is not compiled).

    *prefix* and *suffix* are placed before and after the generated
    non-capturing group.
    """
    ordered = sorted(strings)
    alternatives = regex_opt_inner(ordered, '(?:')
    return prefix + alternatives + suffix

# From here on, the code is our own work again.
# RFC 2396 (URI generic syntax).  Every name below is a regex pattern
# string for the grammar rule of the same name; the rule is quoted in the
# comment where the pattern is not self-explanatory.

# Four dot-separated groups of one to three digits each.
IPv4address = grp(r'(?:\d{1,3}\.){3}\d{1,3}')
reserved = grp(r'[;\/?:@&=+$,]')
alphanum = grp(r'[\da-zA-Z]')
mark = grp(r'[\-_\.!~\*\'\(\)]')
# NOTE: shadows the builtin hex(); the name is kept because it is part of
# this module's public namespace.
hex = grp(r'[\da-fA-F]')
unreserved = regex_opt_r([alphanum, mark])
escaped = grp(r'%' + hex + hex)
pchar = regex_opt_r([unreserved, escaped, r'[:@&=+$,]'])

# Paths: segment = *pchar *( ";" param ), abs_path = "/" path_segments
param = grp(pchar + r'*')
segment = grp(pchar + r'*' + grp(r';' + param) + r'*')
path_segments = grp(segment + grp(r'\/' + segment) + r'*')
abs_path = grp(r'\/' + path_segments)

scheme = grp(r'[a-zA-Z](?:[a-zA-Z\d+\-\.]*)')

# Authority: server = [ [ userinfo "@" ] hostport ]
userinfo = grp(regex_opt_r([unreserved, escaped, r'[;:&=+$,]']) + r'*')
domainlabel = grp(r'[a-zA-Z\d]|(?:[a-zA-Z\d](?:[a-zA-Z\d\-])*[a-zA-Z\d])')
toplabel = grp(r'[a-zA-Z]|(?:[a-zA-Z](?:[a-zA-Z\d\-])*[a-zA-Z\d])')
# hostname = *( domainlabel "." ) toplabel [ "." ]
hostname = grp(opt(domainlabel + r'\.') + r'*' + toplabel + opt(r'\.'))
host = regex_opt_r([hostname, IPv4address])
# One or more digits.
port = r'\d+'
hostport = grp(host + opt(r':' + port))
server = opt(opt(userinfo + r'@') + hostport)
# reg_name = 1*( unreserved | escaped | "$" | "," | ";" | ":" | "@" |
#                "&" | "=" | "+" )
# The character class previously lacked "$", "," and "@".  The "*"
# quantifier (instead of the RFC's "1*") is kept as it was, so reg_name can
# still match the empty string.
reg_name = grp(regex_opt_r([unreserved, escaped, r'[$,;:@&=+]']) + r'*')
authority = regex_opt_r([server, reg_name])
net_path = grp(r'\/\/' + authority + opt(abs_path))
# NOTE(review): RFC 2396 defines hier_part as
# ( net_path | abs_path ) [ "?" query ]; the optional query is not matched
# here -- confirm whether callers strip the query before matching.
hier_part = regex_opt_r([net_path, abs_path])
uric = regex_opt_r([reserved, unreserved, escaped])
uric_no_slash = regex_opt_r([unreserved, escaped, r'[;?:@&=+$,]'])
opaque_part = grp(uric_no_slash + grp(uric) + r'*')
# absoluteURI = scheme ":" ( hier_part | opaque_part )
absoluteURI = grp(scheme + r':' + regex_opt_r([hier_part, opaque_part]))

# RFC 2616 (HTTP/1.1) basic rules, as regex pattern strings.
CTL = r'[\x00-\x1f\x7f]'
CR, LF = r'\r', r'\n'
CRLF = r'\r\n'
HT, SP = r'\t', r' '
# An optional CRLF followed by exactly one space or tab; patterns that need
# a run of whitespace repeat LWS themselves.
LWS = grp(opt(CRLF) + grp(SP + r'|' + HT))
# Any single character that is not a control character, or one LWS.
TEXT = regex_opt_r([r'[^\x00-\x1f\x7f]', LWS])
# Like the first TEXT alternative, but also excluding space and tab.
TEXT_NO_LWS = r'[^\x00-\x1f\x7f \t]'

# A single separator character.
separator = r'[\(\)<>@,;:\\"\/\[\]?=\{\} \t]'
# One or more characters that are neither \x00-\x1f nor separators.
# NOTE(review): RFC 2616 also excludes DEL (\x7f) and non-ASCII octets from
# token; this class still admits them -- confirm whether that is intended.
token = r'[^\x00-\x1f\(\)<>@,;:\\"\/\[\]?=\{\} \t]+'
# One character of quoted text: anything except a double quote and the
# control characters, with tab, LF and CR still allowed.  The opening "["
# was missing, which turned the class into a "^" anchor followed by literal
# characters and broke quotedString.
qdtext = r'[^\x00-\x08\x0b-\x0c\x0e-\x1f\x7f"]'
# A backslash followed by any single ASCII character.
quotedPair = r'\\[\x00-\x7f]'
quotedString = grp(r'"' + regex_opt_r([qdtext, quotedPair]) + r'*"')
# "0" with up to three decimal digits, or "1" with up to three zeros.
qvalue = regex_opt_r([r'0(?:\.\d{0,3})?', r'1(?:\.0{0,3})?'])

# Request line: Method SP Request-URI SP HTTP-Version CRLF.
# Method, URI and version are each captured in their own group.
HTTPVersion = r'HTTP\/\d\.\d'
Method = regex_opt((
    'OPTIONS',
    'GET',
    'HEAD',
    'POST',
    'PUT',
    'DELETE',
    'TRACE',
    'CONNECT',
))
RequestURI = regex_opt_r([r'\*', absoluteURI, abs_path, authority])
RequestLine = grp(
    SP.join([grpm(Method), grpm(RequestURI), grpm(HTTPVersion)]) + CRLF)

# Status line: HTTP-Version SP Status-Code SP Reason-Phrase CRLF.
# Version, status code and reason phrase are each captured.
StatusCode = r'\d{3}'
ReasonPhrase = r'[^\r\n]*'
StatusLine = grp(
    SP.join([grpm(HTTPVersion), grpm(StatusCode), grpm(ReasonPhrase)]) + CRLF)

# Message header: field-name ":" field-value, with name and value captured.
FieldName = token
# One non-whitespace TEXT character, then any TEXT, ending at a position
# that is not followed by LWS (negative lookahead).
FieldContent = grp(TEXT_NO_LWS + TEXT + r'*' + r'(?!' + LWS + r')')
FieldValue = grp(grp(grp(FieldContent) + r'|' + LWS) + r'*')
MessageHeader = grp(r':'.join([grpm(FieldName), grpm(FieldValue)]))

# List separator: a comma followed by any amount of LWS.
ETagSplit = grp(r',' + LWS + r'*')
EncodingSplit = ETagSplit
contentCoding = token
coding = grp(contentCoding + r'|' + r'\*')
# A coding with an optional ";q=" quality value; both parts are captured.
AcceptEncodingValue = grp(
    grpm(coding)
    + grp(r';q=' + grpm(qvalue)) + r'?'
)