# 5.11 KB
import re

# Symbols that are used to represent groups of characters

NUMBERSYMBOL  = 'd'             # 0-9
CHARSYMBOL    = 'e'             # a-zA-Z
NUMCHARSYMBOL = 'f'             # a-zA-Z0-9

# List of characters that are special to regex. Listing them here and
# therefore escaping them helps to make the validator secure.
# NOTE: Please do not add '*', since it is used to mark an infinitely
# long run of a character symbol. (See the examples at the end of the file.)
# NOTE(review): the definition of this constant was missing from the
# listing; the value below is a reconstruction that covers the regex
# metacharacters -- confirm against the upstream file.

DANGEROUSCHARS = '\\()[]{}+?.$^|'

# Matches one run of a single repeated symbol (e.g. 'ddd'), optionally
# followed by '*'. Shared by regex generation and value reconstruction so
# that both split a pattern into exactly the same runs.
_SYMBOL_RUN_RE = re.compile(
    '(?:%s)\\*?' % '|'.join(
        re.escape(symbol) + '+'
        for symbol in (NUMBERSYMBOL, CHARSYMBOL, NUMCHARSYMBOL)))

# Matches any single character listed in DANGEROUSCHARS.
_DANGEROUS_RE = re.compile('[' + re.escape(DANGEROUSCHARS) + ']')


class PatternChecker:
    """Basic, user friendly checker and processor of string values.

    It can verify whether a string value fits a certain pattern of
    digits and letters and possible special characters. In a pattern,
    a run of NUMBERSYMBOL, CHARSYMBOL or NUMCHARSYMBOL stands for exactly
    that many digits, letters or alphanumerics; a trailing '*' on a run
    means "this many minus one, or more". All other characters are
    literals.
    """

    # a dictionary that converts a symbol to its regex template; the
    # placeholders are the minimum and maximum repetition counts
    symbol_regex_dict = {NUMBERSYMBOL  : '([0-9]{%i,%s})',
                         CHARSYMBOL    : '([a-zA-Z]{%i,%s})',
                         NUMCHARSYMBOL : '([0-9a-zA-Z]{%i,%s})'}

    def _escape(self, match_object):
        """Escape a single character.

        `match_object` is a regex match whose whole text is the character
        to escape; the character is returned prefixed with a backslash.
        """
        return '\\' + match_object.group(0)

    def _escape_special_characters(self, s):
        """Escape the characters that have a special meaning in regex.

        Every character of `s` listed in DANGEROUSCHARS is prefixed with
        a backslash; the escaped string is returned.
        """
        return _DANGEROUS_RE.sub(self._escape, s)

    def _unescape_special_characters(self, s):
        """Reverse the escaping, so that the final string is as close as
        possible to the original one.

        Only the escaping backslash of each pair is dropped, so a literal
        backslash (escaped as two backslashes) survives the round trip.
        """
        return re.sub(r'\\(.)', r'\1', s)

    def _replace_symbol_by_regex(self, match_object):
        """Replace a run of a character symbol with its respective regex.

        `match_object` matches one symbol run, optionally ending in '*'.
        Returns the capturing regex group for that run.
        """
        run = match_object.group(0)
        length = len(run)

        # A trailing '*' is counted in the length but is not a symbol:
        # it turns the run into "at least length - 1, no upper bound".
        if run[-1] == '*':
            minimum = length - 1
            maximum = ''
        else:
            minimum = length
            maximum = str(minimum)

        return self.symbol_regex_dict[run[0]] % (minimum, maximum)

    def make_regex_from_pattern(self, pattern):
        """Replace all symbol occurrences and create a complete regex
        string.

        The returned regex is anchored at both ends and tolerates leading
        and trailing spaces; every symbol run becomes one capturing group.
        """
        regex = self._escape_special_characters(pattern)
        # Single pass over all symbols, so mixed runs such as 'dde' are
        # split the same way as in construct_value_from_match.
        regex = _SYMBOL_RUN_RE.sub(self._replace_symbol_by_regex, regex)
        return '^ *' + regex + ' *$'

    def construct_value_from_match(self, result, pattern):
        """After we validated the string, we put it back together; this is
        good, since we can easily clean up the data this way.

        `result` is the regex match of the value and `pattern` the pattern
        whose layout should be used. Each captured group replaces the next
        symbol run of the (escaped) pattern, left to right. If `result`
        has fewer groups than `pattern` has runs, the remaining runs are
        left in place.

        Returns the escaped, filled-in pattern. Raises ValueError when
        `result` has more groups than `pattern` has symbol runs.
        """
        value = self._escape_special_characters(pattern)
        position = 0
        for res in result.groups():
            # Search only behind the previously inserted text; captured
            # letters may themselves look like symbols ('d', 'e', 'f').
            match = _SYMBOL_RUN_RE.search(value, position)
            if match is None:
                raise ValueError(
                    'pattern %r has fewer symbol groups than the match'
                    % (pattern,))
            value = value[:match.start()] + res + value[match.end():]
            position = match.start() + len(res)
        return value

    def clean_value(self, value):
        """Clean up unnecessary white characters.

        Leading and trailing whitespace is removed and every run of
        spaces is collapsed into a single space.
        """
        # same as string.strip, but since I am using re everywhere here,
        # why not use it now too?
        value = re.sub(r'^\s*', '', value)
        value = re.sub(r'\s*$', '', value)
        # make out of several white spaces, one whitespace...
        value = re.sub('  *', ' ', value)
        return value

    def validate_value(self, patterns, value):
        """Validate method that manages the entire validation process.

        The validator goes through each pattern and
        tries to get a match to the value (second parameter). At the end, the
        first pattern of the list is taken to construct the value again; this
        ensures data cleansing and a common data look.

        Returns the normalized value, or None when no pattern matches
        (which includes an empty `patterns` list).
        """
        value = self.clean_value(value)

        result = None
        for pattern in patterns:
            regex = self.make_regex_from_pattern(pattern)
            result = re.search(regex, value)
            if result:
                break

        if not result:
            return None

        value = self.construct_value_from_match(result, patterns[0])
        return self._unescape_special_characters(value)

if __name__ == '__main__':
    # Small demonstration of the checker. Each call prints the normalized
    # value, or None when the value fits none of the given patterns.
    # print(...) with a single argument runs on both Python 2 and 3.

    val = PatternChecker()

    # American long ZIP
    print(val.validate_value(['ddddd-dddd'], '34567-1298'))
    print(val.validate_value(['ddddd-dddd'], '  34567-1298  \t  '))

    # American phone number
    print(val.validate_value(['(ddd) ddd-dddd', 'ddd-ddd-dddd',
                              'ddd ddd-dddd'],
                             '(345) 678-1298'))
    # NOTE(review): the value argument of this call was missing from the
    # listing; this one exercises the second pattern -- confirm upstream.
    print(val.validate_value(['(ddd) ddd-dddd', 'ddd-ddd-dddd',
                              'ddd ddd-dddd'],
                             '345-678-1298'))

    # American money
    print(val.validate_value(['$ d*.dd'], '$ 1345345.00'))

    # German money
    print(val.validate_value(['d*.dd DM'], '267.98 DM'))

    # German license plate
    print(val.validate_value(['eee ee-ddd'], 'OSL HR-683'))

    # German phone number (international)
    print(val.validate_value(['+49 (d*) d*'], '+49 (3574) 7253'))
    print(val.validate_value(['+49 (d*) d*'], '+49  (3574)  7253'))