path: root/project1/proj1_s4498062/webhttp/regexes.py



from os.path import commonprefix
from itertools import groupby
import re

def grpm(regex):
    return grp(regex, matching=True)

def grp(regex, matching=False):
    return r'(' + (r'' if matching else r'?:') + regex + r')'

def opt(regex):
    return grp(grp(regex) + r'?')

def regex_opt_r(regexes):
    return grp(r'|'.join(regexes))

# The below functions were taken from the pygments package 
# (http://pygmenst.org), in particular pygments.regexopt and pygments.lexer
# Some small modifications have been made.
def regex_opt_inner(strings, open_paren):
    """Return a regex that matches any string in the sorted list of strings."""
    close_paren = open_paren and ')' or ''
    # print strings, repr(open_paren)
    if not strings:
        # print '-> nothing left'
        return ''
    first = strings[0]
    if len(strings) == 1:
        # print '-> only 1 string'
        return open_paren + re.escape(first) + close_paren
    if not first:
        # print '-> first string empty'
        return open_paren + regex_opt_inner(strings[1:], '(?:') \
            + '?' + close_paren
    if len(first) == 1:
        # multiple one-char strings? make a charset
        oneletter = []
        rest = []
        for s in strings:
            if len(s) == 1:
                oneletter.append(s)
            else:
                rest.append(s)
        if len(oneletter) > 1:  # do we have more than one oneletter string?
            if rest:
                # print '-> 1-character + rest'
                return open_paren + regex_opt_inner(rest, '') + '|' \
                    + make_charset(oneletter) + close_paren
            # print '-> only 1-character'
            return make_charset(oneletter)
    prefix = commonprefix(strings)
    if prefix:
        plen = len(prefix)
        # we have a prefix for all strings
        # print '-> prefix:', prefix
        return open_paren + re.escape(prefix) \
            + regex_opt_inner([s[plen:] for s in strings], '(?:') \
            + close_paren
    # is there a suffix?
    strings_rev = [s[::-1] for s in strings]
    suffix = commonprefix(strings_rev)
    if suffix:
        slen = len(suffix)
        # print '-> suffix:', suffix[::-1]
        return open_paren \
            + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \
            + re.escape(suffix[::-1]) + close_paren
    # recurse on common 1-string prefixes
    # print '-> last resort'
    return open_paren + \
        '|'.join(regex_opt_inner(list(group[1]), '')
                 for group in groupby(strings, lambda s: s[0] == first[0])) \
        + close_paren

def regex_opt(strings, prefix='', suffix=''):
    """Return a compiled regex that matches any string in the given list.

    The strings to match must be literal strings, not regexes.  They will be
    regex-escaped.

    *prefix* and *suffix* are pre- and appended to the final regex.
    """
    strings = sorted(strings)
    return prefix + regex_opt_inner(strings, '(?:') + suffix

## From here it is own work again
# RFC 2396
IPv4address = grp(r'(?:\d{1,3}\.){3}\d{1,3}')
reserved = grp(r'[;\/?:@&=+$,]')
alphanum = grp(r'[\da-zA-Z]')
mark = grp(r'[\-_\.!~\*\'\(\)]')
hex = grp(r'[\da-fA-F]')
unreserved = regex_opt_r([alphanum, mark])
escaped = grp(r'%' + hex + hex)
pchar = regex_opt_r([unreserved, escaped, r'[:@&=+$,]'])
param = grp(pchar + r'*')
segment = grp(pchar + r'*' + grp(r';' + param) + r'*')
path_segments = grp(segment + grp(r'\/' + segment) + r'*')
abs_path = grp(r'\/' + path_segments)
scheme = grp(r'[a-zA-Z](?:[a-zA-Z\d+\-\.]*)')
userinfo = grp(regex_opt_r([unreserved, escaped, r'[;:&=+$,]']) + r'*')
domainlabel = grp(r'[a-zA-Z\d]|(?:[a-zA-Z\d](?:[a-zA-Z\d\-])*[a-zA-Z\d])')
toplabel = grp(r'[a-zA-Z]|(?:[a-zA-Z](?:[a-zA-Z\d\-])*[a-zA-Z\d])')
hostname = grp(opt(domainlabel + r'\.') + r'*' + toplabel + opt(r'\.'))
host = regex_opt_r([hostname, IPv4address])
port = r'\d+'
hostport = grp(host + opt(r':' + port))
server = opt(opt(userinfo + r'@') + hostport)
reg_name = grp(regex_opt_r([unreserved, escaped, r'[;:&=+]']) + r'*')
authority = regex_opt_r([server, reg_name])
net_path = grp(r'\/\/' + authority + opt(abs_path))
hier_part = regex_opt_r([net_path, abs_path])
uric = regex_opt_r([reserved, unreserved, escaped])
uric_no_slash = regex_opt_r([unreserved, escaped, r'[;?:@&=+$,]'])
opaque_part = grp(uric_no_slash + grp(uric) + r'*')
absoluteURI = grp(scheme + r':' + regex_opt_r([hier_part, opaque_part]))

# RFC 2616
CTL = r'[\x00-\x1f\x7f]'
CR = r'\r'
LF = r'\n'
CRLF = CR + LF
HT = r'\t'
SP = r' '
LWS = grp(opt(CRLF) + regex_opt_r([SP, HT]))
TEXT = grp(r'[^\x00-\x1f\x7f]|' + LWS)
TEXT_NO_LWS = r'[^\x00-\x1f\x7f \t]'

separator = r'[\(\)<>@,;:\\"\/\[\]?=\{\} \t]'
token = r'[^\x00-\x1f\(\)<>@,;:\\"\/\[\]?=\{\} \t]+'
qdtext = r'^\x00-\x08\x0b-\x0c\x0e-\x1f\x7f"]'
quotedPair = r'\\[\x00-\x7f]'
quotedString = grp(r'"' + regex_opt_r([qdtext, quotedPair]) + r'*"')
qvalue = regex_opt_r([r'0(?:\.\d{0,3})?', r'1(?:\.0{0,3})?'])

HTTPVersion = r'HTTP\/\d\.\d'
Method = regex_opt(['OPTIONS', 'GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE',
       'CONNECT'])
RequestURI = regex_opt_r([r'\*', absoluteURI, abs_path, authority])
RequestLine = grp(grpm(Method) + SP + grpm(RequestURI) + SP +
        grpm(HTTPVersion) + CRLF)

StatusCode = r'\d{3}'
ReasonPhrase = r'[^\r\n]*'
StatusLine = grp(grpm(HTTPVersion) + SP + grpm(StatusCode) + SP + 
        grpm(ReasonPhrase) + CRLF)

FieldName = token
FieldContent = regex_opt_r([TEXT_NO_LWS + TEXT + r'*(?!' + LWS + r')'])
FieldValue = grp(regex_opt_r([grp(FieldContent), LWS]) + r'*')
MessageHeader = grp(grpm(FieldName) + r':' + grpm(FieldValue))

ETagSplit = grp(r',' + LWS + r'*')
EncodingSplit = ETagSplit
contentCoding = token
coding = regex_opt_r([contentCoding, r'\*'])
AcceptEncodingValue = grp(grpm(coding) + grp(r';q=' + grpm(qvalue)) + r'?')
from os.path import commonprefix
from itertools import groupby
import re

def grpm(regex):
    return grp(regex, matching=True)

def grp(regex, matching=False):
    return r'(' + (r'' if matching else r'?:') + regex + r')'

def opt(regex):
    return grp(grp(regex) + r'?')

def regex_opt_r(regexes):
    return grp(r'|'.join(regexes))

# The below functions were taken from the pygments package 
# (http://pygmenst.org), in particular pygments.regexopt and pygments.lexer
# Some small modifications have been made.
def regex_opt_inner(strings, open_paren):
    """Return a regex that matches any string in the sorted list of strings."""
    close_paren = open_paren and ')' or ''
    # print strings, repr(open_paren)
    if not strings:
        # print '-> nothing left'
        return ''
    first = strings[0]
    if len(strings) == 1:
        # print '-> only 1 string'
        return open_paren + re.escape(first) + close_paren
    if not first:
        # print '-> first string empty'
        return open_paren + regex_opt_inner(strings[1:], '(?:') \
            + '?' + close_paren
    if len(first) == 1:
        # multiple one-char strings? make a charset
        oneletter = []
        rest = []
        for s in strings:
            if len(s) == 1:
                oneletter.append(s)
            else:
                rest.append(s)
        if len(oneletter) > 1:  # do we have more than one oneletter string?
            if rest:
                # print '-> 1-character + rest'
                return open_paren + regex_opt_inner(rest, '') + '|' \
                    + make_charset(oneletter) + close_paren
            # print '-> only 1-character'
            return make_charset(oneletter)
    prefix = commonprefix(strings)
    if prefix:
        plen = len(prefix)
        # we have a prefix for all strings
        # print '-> prefix:', prefix
        return open_paren + re.escape(prefix) \
            + regex_opt_inner([s[plen:] for s in strings], '(?:') \
            + close_paren
    # is there a suffix?
    strings_rev = [s[::-1] for s in strings]
    suffix = commonprefix(strings_rev)
    if suffix:
        slen = len(suffix)
        # print '-> suffix:', suffix[::-1]
        return open_paren \
            + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \
            + re.escape(suffix[::-1]) + close_paren
    # recurse on common 1-string prefixes
    # print '-> last resort'
    return open_paren + \
        '|'.join(regex_opt_inner(list(group[1]), '')
                 for group in groupby(strings, lambda s: s[0] == first[0])) \
        + close_paren

def regex_opt(strings, prefix='', suffix=''):
    """Return a compiled regex that matches any string in the given list.

    The strings to match must be literal strings, not regexes.  They will be
    regex-escaped.

    *prefix* and *suffix* are pre- and appended to the final regex.
    """
    strings = sorted(strings)
    return prefix + regex_opt_inner(strings, '(?:') + suffix

## From here it is own work again
# RFC 2396
IPv4address = grp(r'(?:\d{1,3}\.){3}\d{1,3}')
reserved = grp(r'[;\/?:@&=+$,]')
alphanum = grp(r'[\da-zA-Z]')
mark = grp(r'[\-_\.!~\*\'\(\)]')
hex = grp(r'[\da-fA-F]')
unreserved = regex_opt_r([alphanum, mark])
escaped = grp(r'%' + hex + hex)
pchar = regex_opt_r([unreserved, escaped, r'[:@&=+$,]'])
param = grp(pchar + r'*')
segment = grp(pchar + r'*' + grp(r';' + param) + r'*')
path_segments = grp(segment + grp(r'\/' + segment) + r'*')
abs_path = grp(r'\/' + path_segments)
scheme = grp(r'[a-zA-Z](?:[a-zA-Z\d+\-\.]*)')
userinfo = grp(regex_opt_r([unreserved, escaped, r'[;:&=+$,]']) + r'*')
domainlabel = grp(r'[a-zA-Z\d]|(?:[a-zA-Z\d](?:[a-zA-Z\d\-])*[a-zA-Z\d])')
toplabel = grp(r'[a-zA-Z]|(?:[a-zA-Z](?:[a-zA-Z\d\-])*[a-zA-Z\d])')
hostname = grp(opt(domainlabel + r'\.') + r'*' + toplabel + opt(r'\.'))
host = regex_opt_r([hostname, IPv4address])
port = r'\d+'
hostport = grp(host + opt(r':' + port))
server = opt(opt(userinfo + r'@') + hostport)
reg_name = grp(regex_opt_r([unreserved, escaped, r'[;:&=+]']) + r'*')
authority = regex_opt_r([server, reg_name])
net_path = grp(r'\/\/' + authority + opt(abs_path))
hier_part = regex_opt_r([net_path, abs_path])
uric = regex_opt_r([reserved, unreserved, escaped])
uric_no_slash = regex_opt_r([unreserved, escaped, r'[;?:@&=+$,]'])
opaque_part = grp(uric_no_slash + grp(uric) + r'*')
absoluteURI = grp(scheme + r':' + regex_opt_r([hier_part, opaque_part]))

# RFC 2616
CTL = r'[\x00-\x1f\x7f]'
CR = r'\r'
LF = r'\n'
CRLF = CR + LF
HT = r'\t'
SP = r' '
LWS = grp(opt(CRLF) + regex_opt_r([SP, HT]))
TEXT = grp(r'[^\x00-\x1f\x7f]|' + LWS)
TEXT_NO_LWS = r'[^\x00-\x1f\x7f \t]'

separator = r'[\(\)<>@,;:\\"\/\[\]?=\{\} \t]'
token = r'[^\x00-\x1f\(\)<>@,;:\\"\/\[\]?=\{\} \t]+'
qdtext = r'^\x00-\x08\x0b-\x0c\x0e-\x1f\x7f"]'
quotedPair = r'\\[\x00-\x7f]'
quotedString = grp(r'"' + regex_opt_r([qdtext, quotedPair]) + r'*"')
qvalue = regex_opt_r([r'0(?:\.\d{0,3})?', r'1(?:\.0{0,3})?'])

HTTPVersion = r'HTTP\/\d\.\d'
Method = regex_opt(['OPTIONS', 'GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE',
       'CONNECT'])
RequestURI = regex_opt_r([r'\*', absoluteURI, abs_path, authority])
RequestLine = grp(grpm(Method) + SP + grpm(RequestURI) + SP +
        grpm(HTTPVersion) + CRLF)

StatusCode = r'\d{3}'
ReasonPhrase = r'[^\r\n]*'
StatusLine = grp(grpm(HTTPVersion) + SP + grpm(StatusCode) + SP + 
        grpm(ReasonPhrase) + CRLF)

FieldName = token
FieldContent = regex_opt_r([TEXT_NO_LWS + TEXT + r'*(?!' + LWS + r')'])
FieldValue = grp(regex_opt_r([grp(FieldContent), LWS]) + r'*')
MessageHeader = grp(grpm(FieldName) + r':' + grpm(FieldValue))

ETagSplit = grp(r',' + LWS + r'*')
EncodingSplit = ETagSplit
contentCoding = token
coding = regex_opt_r([contentCoding, r'\*'])
AcceptEncodingValue = grp(grpm(coding) + grp(r';q=' + grpm(qvalue)) + r'?')