|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | + |
| 3 | +import re |
| 4 | + |
| 5 | +# INTERNAL USE ONLY REGEX! |
| 6 | + |
# Matches a whole string holding a decimal number: an optional sign (group 1)
# followed by either digits with an optional fraction and an optional unsigned,
# lowercase "e" exponent, or a bare fraction such as ".5".
NUMBER_RE = re.compile(
    r'^'
    r'([+\-]?)'  # optional leading sign
    r'('
    r'(\d+)(\.\d+)?(e\d+)?'  # integer part, optional fraction, optional exponent
    r'|'
    r'\.\d+'  # fraction without an integer part
    r')'
    r'$'
)
| 8 | + |
# Raw (uncompiled) URL pattern, assembled from one capturing group per URL part.
# NOTE: inside the query-string class, "%-=" is a character RANGE (from "%" to
# "="), so that class also admits characters such as "&", ".", "/", ":" and ";".
URLS_RAW_STRING = ''.join([
    r'([a-z-]+://)',  # scheme
    r'([a-z_\d-]+:[a-z_\d-]+@)?',  # user:password
    r'(www\.)?',  # www.
    r'((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)',  # domain
    r'(:\d{2,})?',  # port number
    r'(/[a-z\d_%+-]*)*',  # folders
    r'(\.[a-z\d_%+-]+)*',  # file extension
    r'(\?[a-z\d_+%-=]*)?',  # query string
    r'(#\S*)?',  # hash
])

# Anchored variant: the whole string must be a single URL.
URL_RE = re.compile('^' + URLS_RAW_STRING + '$', re.IGNORECASE)

# Unanchored variant wrapped in one extra outer group, for finding URLs in text.
URLS_RE = re.compile('(' + URLS_RAW_STRING + ')', re.IGNORECASE)
| 24 | + |
# Raw (uncompiled) e-mail pattern: a local part made of letters, digits and
# "._+-", an "@", one or more dot-terminated domain labels, then a top-level
# domain of 2 to 4 letters.
# The label group is non-capturing on purpose, so EMAILS_RE below keeps exactly
# one capturing group (and findall() keeps returning plain strings).
# Labels may be a single character ("x.com") and any number of sub-domains is
# accepted ("mail.example.co.uk").
EMAILS_RAW_STRING = r'[a-zA-Z\d._+-]+@(?:[a-z\d-]+\.)+[a-z]{2,4}'

# Domain names are case-insensitive, so both compiled patterns ignore case.

# Anchored variant: the whole string must be a single e-mail address.
EMAIL_RE = re.compile(r'^{}$'.format(EMAILS_RAW_STRING), re.IGNORECASE)

# Unanchored variant wrapped in one outer group, for finding addresses in text.
EMAILS_RE = re.compile(r'({})'.format(EMAILS_RAW_STRING), re.IGNORECASE)
| 30 | + |
# Whole-string test: letters and digits only, starting with a letter, and
# containing at least one lower->upper or upper->lower case transition.
CAMEL_CASE_TEST_RE = re.compile(
    r'^[a-zA-Z]*'
    r'([a-z]+[A-Z]+|[A-Z]+[a-z]+)'  # the mandatory case transition
    r'[a-zA-Z\d]*$'
)

# Captures a lowercase letter, or a run of uppercase letters, that is
# immediately followed by an uppercase letter (the lookahead is not consumed).
CAMEL_CASE_REPLACE_RE = re.compile(
    r'([a-z]|[A-Z]+)'
    r'(?=[A-Z])'
)
| 34 | + |
# Whole-string test for underscore-separated identifiers: either letters
# (optionally followed by digits) and then at least one "_", or one or more
# leading "_" followed by letters/digits. Case-insensitive.
SNAKE_CASE_TEST_RE = re.compile(r'^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$', re.IGNORECASE)

# Same test with "-" as the separator. It is anchored with "^" exactly like the
# underscore variant, so .search() cannot accept a string that merely ENDS with
# a dash-separated identifier (results of .match() are unaffected).
SNAKE_CASE_TEST_DASH_RE = re.compile(r'^([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$', re.IGNORECASE)
| 38 | + |
# An underscore (group 1) followed by a lowercase letter or a digit (group 2).
SNAKE_CASE_REPLACE_RE = re.compile(
    r'(_)'
    r'([a-z\d])'
)

# A dash (group 1) followed by a lowercase letter or a digit (group 2).
SNAKE_CASE_REPLACE_DASH_RE = re.compile(
    r'(-)'
    r'([a-z\d])'
)
| 42 | + |
# Card issuer name -> compiled whole-string pattern for that issuer's numbers
# (digits only, no separators).
CREDIT_CARDS = {
    issuer: re.compile(number_pattern)
    for issuer, number_pattern in (
        ('VISA', r'^4\d{12}(?:\d{3})?$'),  # 13 or 16 digits, starts with 4
        ('MASTERCARD', r'^5[1-5]\d{14}$'),  # 16 digits, starts with 51-55
        ('AMERICAN_EXPRESS', r'^3[47]\d{13}$'),  # 15 digits, starts with 34 or 37
        ('DINERS_CLUB', r'^3(?:0[0-5]|[68]\d)\d{11}$'),  # 14 digits, starts with 300-305, 36 or 38
        ('DISCOVER', r'^6(?:011|5\d{2})\d{12}$'),  # 16 digits, starts with 6011 or 65
        ('JCB', r'^(?:2131|1800|35\d{3})\d{11}$'),  # 15 digits (2131/1800) or 16 digits (35)
    )
}
| 51 | + |
# Text wrapped in an opening "[" or "{" and a closing "}" or "]", with optional
# surrounding whitespace; group 1 captures what lies between the brackets.
# The pattern does not check that the opening and closing brackets are paired.
JSON_WRAPPER_RE = re.compile(
    r'^\s*[\[{]'  # opening bracket
    r'\s*(.*)\s*'  # wrapped content
    r'[\}\]]\s*$',  # closing bracket
    re.DOTALL | re.MULTILINE
)
| 53 | + |
# Textual UUID: five dash-separated runs of hexadecimal digits with lengths
# 8-4-4-4-12, matched case-insensitively against the whole string.
UUID_RE = re.compile(
    r'^[a-f\d]{8}'
    r'-[a-f\d]{4}'
    r'-[a-f\d]{4}'
    r'-[a-f\d]{4}'
    r'-[a-f\d]{12}$',
    re.IGNORECASE
)
| 55 | + |
# "Shallow" IPv4 shape check: four dot-separated groups of 1 to 3 digits.
# The numeric range of each group is NOT validated by this pattern.
SHALLOW_IP_V4_RE = re.compile(r'^' + r'\.'.join([r'\d{1,3}'] * 4) + r'$')
| 57 | + |
# IPv6 shape check: eight colon-separated groups of 0 to 4 hexadecimal digits,
# matched case-insensitively against the whole string.
# Groups are restricted to hex digits (0-9, a-f); letters g-z are rejected.
IP_V6_RE = re.compile(r'^([a-f\d]{0,4}:){7}[a-f\d]{0,4}$', re.IGNORECASE)
| 59 | + |
# One "word" per match: a run of word characters other than underscore,
# together with any non-word characters surrounding it.
WORDS_COUNT_RE = re.compile(
    r'\W*'
    r'[^\W_]+'  # the word itself (letters/digits, no underscore)
    r'\W*',
    re.UNICODE | re.MULTILINE | re.IGNORECASE
)
| 61 | + |
# Matches a piece of HTML markup: an opening tag (optionally namespaced) with,
# when present, everything up to the first closing tag that follows it; or an
# HTML comment; or a doctype declaration.
# The comment and doctype branches are non-greedy: with DOTALL a greedy ".*"
# would run to the LAST "-->" / ">" in the input and swallow the text between
# two separate comments (or everything after the doctype).
HTML_RE = re.compile(
    r'((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*?-->|<!doctype.*?>)',
    re.IGNORECASE | re.MULTILINE | re.DOTALL
)
| 66 | + |
# Matches a single HTML tag only (opening or closing, optionally namespaced),
# an HTML comment, or a doctype declaration -- never the text between tags.
# The comment and doctype branches are non-greedy: with DOTALL a greedy ".*"
# would run to the LAST "-->" / ">" in the input and swallow the text between
# two separate comments (or everything after the doctype).
HTML_TAG_ONLY_RE = re.compile(
    r'(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*?-->|<!doctype.*?>)',
    re.IGNORECASE | re.MULTILINE | re.DOTALL
)
| 71 | + |
# A single whitespace character (space, tab, newline, ...).
SPACES_RE = re.compile(pattern=r'\s')
| 73 | + |
# Named patterns describing spacing, duplication and capitalisation issues in
# free text. Every multi-branch pattern is a single capturing group whose
# alternatives are joined with "|" in the order listed.
PRETTIFY_RE = {
    # runs of a sign that should appear only once in a row (repeated spaces,
    # duplicated quotes, ...)
    'DUPLICATES': re.compile(
        '(' + '|'.join((
            r'\({2,}', r'\){2,}',  # round brackets
            r'\[{2,}', r'\]{2,}',  # square brackets
            r'{{2,}', r'\}{2,}',  # curly brackets
            r':{2,}', r',{2,}', r';{2,}',  # colon, comma, semicolon
            r'\+{2,}', r'-{2,}',  # plus, minus
            r'\s{2,}',  # whitespace
            r'%{2,}', r'={2,}',  # percent, equal
            r'"{2,}', r'\'{2,}',  # double and single quotes
        )) + ')',
        re.MULTILINE
    ),

    # a sign must not have a space before it, nor lack a space after it;
    # dots and commas are the exception, since digits may follow them
    # (5.5 or 5,5 is fine)
    'RIGHT_SPACE': re.compile(
        '(' + '|'.join((
            # comma (,)
            r'(?<=[^\s\d]),(?=[^\s\d])|\s,\s|\s,(?=[^\s\d])|\s,(?!.)',
            # dot (.)
            r'(?<=[^\s\d.])\.+(?=[^\s\d.])|\s\.+\s|\s\.+(?=[^\s\d])|\s\.+(?!\.)',
            # semicolon (;)
            r'(?<=\S);(?=\S)|\s;\s|\s;(?=\S)|\s;(?!.)',
            # colon (:)
            r'(?<=\S):(?=\S)|\s:\s|\s:(?=\S)|\s:(?!.)',
            # exclamation (!)
            r'(?<=[^\s!])!+(?=[^\s!])|\s!+\s|\s!+(?=[^\s!])|\s!+(?!!)',
            # question (?)
            r'(?<=[^\s?])\?+(?=[^\s?])|\s\?+\s|\s\?+(?=[^\s?])|\s\?+(?!\?)',
            # percentage (%)
            r'\d%(?=\S)|(?<=\d)\s%\s|(?<=\d)\s%(?=\S)|(?<=\d)\s%(?!.)',
        )) + ')',
        re.DOTALL | re.MULTILINE
    ),

    # quoted or bracketed text with wrong spacing when a punctuation sign follows
    'LEFT_SPACE': re.compile(
        '(' + '|'.join((
            # quoted text ("hello world")
            r'\s"[^"]+"(?=[?.:!,;])|(?<=\S)"[^"]+"\s|(?<=\S)"[^"]+"(?=[?.:!,;])',
            # text in round brackets
            r'\s\([^)]+\)(?=[?.:!,;])|(?<=\S)\([^)]+\)\s|(?<=\S)(\([^)]+\))(?=[?.:!,;])',
        )) + ')',
        re.DOTALL | re.MULTILINE
    ),

    # the first char of the string, after optional leading whitespace
    # (this one must therefore NOT be MULTILINE)
    'UPPERCASE_FIRST_LETTER': re.compile(r'^\s*\w', re.UNICODE),

    # a sign that must be followed by an uppercase letter (".", "?", "!"),
    # together with the whitespace and the char that follow it
    'UPPERCASE_AFTER_SIGN': re.compile(r'([.?!]\s\w)', re.UNICODE | re.MULTILINE),

    # operators, quoted text and bracketed text that lack a space on at least
    # one side
    'SPACES_AROUND': re.compile(
        '(' + '|'.join((
            # plus (+)
            r'(?<=\S)\+(?=\S)|(?<=\S)\+\s|\s\+(?=\S)',
            # minus (-)
            r'(?<=\S)-(?=\S)|(?<=\S)-\s|\s-(?=\S)',
            # division (/)
            r'(?<=\S)/(?=\S)|(?<=\S)/\s|\s/(?=\S)',
            # multiplication (*)
            r'(?<=\S)\*(?=\S)|(?<=\S)\*\s|\s\*(?=\S)',
            # equal (=)
            r'(?<=\S)=(?=\S)|(?<=\S)=\s|\s=(?=\S)',
            # quoted text ("hello world")
            r'\s"[^"]+"(?=[^\s?.:!,;])|(?<=\S)"[^"]+"\s|(?<=\S)"[^"]+"(?=[^\s?.:!,;])',
            # text in round brackets
            r'\s\([^)]+\)(?=[^\s?.:!,;])|(?<=\S)\([^)]+\)\s|(?<=\S)(\([^)]+\))(?=[^\s?.:!,;])',
        )) + ')',
        re.DOTALL | re.MULTILINE
    ),

    # the content found between double quotes or between round brackets
    'SPACES_INSIDE': re.compile(
        '(' + '|'.join((
            r'(?<=")[^"]+(?=")',  # quoted text ("hello world")
            r'(?<=\()[^)]+(?=\))',  # text in round brackets
        )) + ')',
        re.DOTALL | re.MULTILINE
    ),

    # badly spaced saxon genitive ("Dave' s dog", "Dave 's dog")
    'SAXON_GENITIVE': re.compile(
        '(' + '|'.join((
            r'(?<=\w)\'\ss\s',
            r'(?<=\w)\s\'s(?=\w)',
            r'(?<=\w)\s\'s\s(?=\w)',
        )) + ')',
        re.UNICODE | re.MULTILINE
    ),
}
| 148 | + |
# A run of characters that are neither letters nor digits: either a run of
# non-word characters, or a run of underscores.
NO_LETTERS_OR_NUMBERS_RE = re.compile(
    r'[^\w\d]+'  # anything that is not a word character
    r'|'
    r'_+',  # underscores count as word characters, so they get their own branch
    re.UNICODE | re.IGNORECASE
)
| 150 | + |
# Leading run of whitespace at the very start of the string, excluding
# carriage returns and newlines (no MULTILINE: only the first line can match).
MARGIN_RE = re.compile(
    r'^'
    r'[^\S\r\n]+'  # whitespace other than \r and \n
)
0 commit comments