"""A collection of string operations (most are no longer used).

Warning: most of the code you see here isn't normally used nowadays.
Beginning with Python 1.6, many of these functions are implemented as
methods on the standard string object. They used to be implemented by
a built-in module called strop, but strop is now obsolete itself.

Public module variables:

whitespace -- a string containing all characters considered whitespace
lowercase -- a string containing all characters considered lowercase letters
uppercase -- a string containing all characters considered uppercase letters
letters -- a string containing all characters considered letters
digits -- a string containing all characters considered decimal digits
hexdigits -- a string containing all characters considered hexadecimal digits
octdigits -- a string containing all characters considered octal digits
punctuation -- a string containing all characters considered punctuation
printable -- a string containing all characters considered printable

"""

Guido van Rossum's avatar
Guido van Rossum committed
22
# Some strings for ctype-style character classification.
# whitespace, lowercase, uppercase and letters may be rebound by the
# optional strop import at the end of this file; the ascii_* names are
# bound here, before that import runs.
whitespace = ' \t\n\r\v\f'
lowercase = 'abcdefghijklmnopqrstuvwxyz'
uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
letters = lowercase + uppercase
ascii_lowercase = lowercase
ascii_uppercase = uppercase
ascii_letters = ascii_lowercase + ascii_uppercase
digits = '0123456789'
hexdigits = digits + 'abcdef' + 'ABCDEF'
octdigits = '01234567'
# The backslash is written as an explicit escape; the value is unchanged.
punctuation = """!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"""
printable = digits + letters + punctuation + whitespace
Guido van Rossum's avatar
Guido van Rossum committed
35 36

# Case conversion helpers
37 38 39 40
# Use str to convert Unicode literal in case of -U
l = map(chr, xrange(256))
_idmap = str('').join(l)
del l
Guido van Rossum's avatar
Guido van Rossum committed
41

42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
# Functions which aren't available as string methods.

# Capitalize the words in a string, e.g. " aBc  dEf " -> "Abc Def".
def capwords(s, sep=None):
    """capwords(s, [sep]) -> string

    Break s into words with s.split(sep), capitalize every word with
    its capitalize() method, and glue the words back together.  The
    words are joined with sep, or with a single space when sep is
    None or empty.  With sep=None, split() collapses runs of
    whitespace, so such runs come out as one space.

    """
    # Resolve the joiner first so that argument errors surface in the
    # same order as they always have.
    glue = (sep or ' ').join
    return glue([word.capitalize() for word in s.split(sep)])


# Construct a translation string
_idmapL = None      # list form of _idmap, built on first use
def maketrans(fromstr, tostr):
    """maketrans(frm, to) -> string

    Return a translation table (a string of 256 bytes long)
    suitable for use in string.translate.  The strings frm and to
    must be of the same length.

    Raises ValueError when the two arguments differ in length.

    """
    global _idmapL
    if len(fromstr) != len(tostr):
        raise ValueError("maketrans arguments must have same length")
    if not _idmapL:
        _idmapL = list(_idmap)
    # Work on a copy so the cached identity table is never modified.
    table = _idmapL[:]
    for src, dst in zip(fromstr, tostr):
        table[ord(src)] = dst
    return ''.join(table)


79

80
####################################################################
81 82
import re as _re

83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
class _multimap:
    """Helper class for combining multiple mappings.

    Used by .{safe_,}substitute() to combine the mapping and keyword
    arguments.
    """
    def __init__(self, primary, secondary):
        self._primary = primary
        self._secondary = secondary

    def __getitem__(self, key):
        try:
            return self._primary[key]
        except KeyError:
            return self._secondary[key]


100 101
class _TemplateMetaclass(type):
    pattern = r"""
102 103 104 105 106 107
    %(delim)s(?:
      (?P<escaped>%(delim)s) |   # Escape sequence of two delimiters
      (?P<named>%(id)s)      |   # delimiter and a Python identifier
      {(?P<braced>%(id)s)}   |   # delimiter and a braced identifier
      (?P<invalid>)              # Other ill-formed delimiter exprs
    )
108 109 110
    """

    def __init__(cls, name, bases, dct):
111
        super(_TemplateMetaclass, cls).__init__(name, bases, dct)
112 113 114 115
        if 'pattern' in dct:
            pattern = cls.pattern
        else:
            pattern = _TemplateMetaclass.pattern % {
116
                'delim' : _re.escape(cls.delimiter),
117 118 119 120 121 122
                'id'    : cls.idpattern,
                }
        cls.pattern = _re.compile(pattern, _re.IGNORECASE | _re.VERBOSE)


class Template:
    """A string class for supporting $-substitutions.

    The class attributes delimiter and idpattern are read by
    _TemplateMetaclass, which compiles them into the pattern attribute
    used by substitute() and safe_substitute().
    """
    __metaclass__ = _TemplateMetaclass

    delimiter = '$'
    idpattern = r'[_a-z][_a-z0-9]*'

    def __init__(self, template):
        self.template = template

    # Search for $$, $identifier, ${identifier}, and any bare $'s

    def _invalid(self, mo):
        """Raise ValueError naming the line and column of a bad placeholder.

        mo is the match object whose 'invalid' group matched.
        """
        i = mo.start('invalid')
        # Keep line endings so that character counts stay exact.
        lines = self.template[:i].splitlines(True)
        if not lines:
            colno = 1
            lineno = 1
        else:
            colno = i - len(''.join(lines[:-1]))
            lineno = len(lines)
        raise ValueError('Invalid placeholder in string: line %d, col %d' %
                         (lineno, colno))

    def substitute(self, *args, **kws):
        """Return the template with every placeholder replaced.

        Accepts at most one positional mapping plus keyword arguments;
        when both are given, keyword arguments are looked up first.
        Raises TypeError for more than one positional argument,
        KeyError (from the mapping) for a missing name, and ValueError
        for an ill-formed placeholder.
        """
        if len(args) > 1:
            raise TypeError('Too many positional arguments')
        if not args:
            mapping = kws
        elif kws:
            mapping = _multimap(kws, args[0])
        else:
            mapping = args[0]
        # Helper function for .sub()
        def convert(mo):
            # Check the most common path first.
            named = mo.group('named') or mo.group('braced')
            if named is not None:
                val = mapping[named]
                # We use this idiom instead of str() because the latter will
                # fail if val is a Unicode containing non-ASCII characters.
                return '%s' % (val,)
            if mo.group('escaped') is not None:
                return self.delimiter
            if mo.group('invalid') is not None:
                self._invalid(mo)
            raise ValueError('Unrecognized named group in pattern',
                             self.pattern)
        return self.pattern.sub(convert, self.template)

    def safe_substitute(self, *args, **kws):
        """Like substitute(), but leave unresolvable placeholders in place.

        A placeholder whose name raises KeyError is emitted unchanged,
        and an ill-formed delimiter is emitted as the bare delimiter
        instead of raising ValueError.  Raises TypeError for more than
        one positional argument.
        """
        if len(args) > 1:
            raise TypeError('Too many positional arguments')
        if not args:
            mapping = kws
        elif kws:
            mapping = _multimap(kws, args[0])
        else:
            mapping = args[0]
        # Helper function for .sub()
        def convert(mo):
            named = mo.group('named')
            if named is not None:
                try:
                    # We use this idiom instead of str() because the latter
                    # will fail if val is a Unicode containing non-ASCII
                    # characters.
                    return '%s' % (mapping[named],)
                except KeyError:
                    return self.delimiter + named
            braced = mo.group('braced')
            if braced is not None:
                try:
                    return '%s' % (mapping[braced],)
                except KeyError:
                    return self.delimiter + '{' + braced + '}'
            if mo.group('escaped') is not None:
                return self.delimiter
            if mo.group('invalid') is not None:
                return self.delimiter
            raise ValueError('Unrecognized named group in pattern',
                             self.pattern)
        return self.pattern.sub(convert, self.template)
204 205


206

207
####################################################################
# NOTE: Everything below here is deprecated.  Use string methods instead.
# This stuff will go away in Python 3.0.

211 212 213 214 215 216
# Backward compatible names for exceptions: every legacy name is simply
# another name for ValueError.
index_error = atoi_error = atof_error = atol_error = ValueError

Guido van Rossum's avatar
Guido van Rossum committed
217 218
# Convert UPPER CASE letters to lower case
def lower(s):
    """lower(s) -> string

    Return a copy of the string s converted to lowercase.
    Delegates to s.lower().

    """
    return s.lower()
Guido van Rossum's avatar
Guido van Rossum committed
225 226 227

# Convert lower case letters to UPPER CASE
def upper(s):
    """upper(s) -> string

    Return a copy of the string s converted to uppercase.
    Delegates to s.upper().

    """
    return s.upper()
Guido van Rossum's avatar
Guido van Rossum committed
234 235 236

# Swap lower case letters and UPPER CASE
def swapcase(s):
    """swapcase(s) -> string

    Return a copy of the string s with upper case characters
    converted to lowercase and vice versa.  Delegates to
    s.swapcase().

    """
    return s.swapcase()
Guido van Rossum's avatar
Guido van Rossum committed
244 245

# Strip leading and trailing whitespace (or the given characters)
def strip(s, chars=None):
    """strip(s [,chars]) -> string

    Return a copy of the string s with leading and trailing
    whitespace removed.
    If chars is given and not None, remove characters in chars instead.
    If chars is unicode, s will be converted to unicode before stripping.

    """
    return s.strip(chars)
Guido van Rossum's avatar
Guido van Rossum committed
256

257
# Strip leading whitespace (or the given characters)
def lstrip(s, chars=None):
    """lstrip(s [,chars]) -> string

    Return a copy of the string s with leading whitespace removed.
    If chars is given and not None, remove characters in chars instead.

    """
    return s.lstrip(chars)
266 267

# Strip trailing whitespace (or the given characters)
def rstrip(s, chars=None):
    """rstrip(s [,chars]) -> string

    Return a copy of the string s with trailing whitespace removed.
    If chars is given and not None, remove characters in chars instead.

    """
    return s.rstrip(chars)
276 277


Guido van Rossum's avatar
Guido van Rossum committed
278
# Split a string into a list of space/tab-separated words
def split(s, sep=None, maxsplit=-1):
    """split(s [,sep [,maxsplit]]) -> list of strings

    Return a list of the words in the string s, using sep as the
    delimiter string.  If maxsplit is given, splits at no more than
    maxsplit places (resulting in at most maxsplit+1 words).  If sep
    is not specified or is None, any whitespace string is a separator.

    (split and splitfields are synonymous)

    """
    return s.split(sep, maxsplit)
splitfields = split
292

293 294 295 296 297 298 299 300 301 302 303 304
# Split a string into a list of words, working from the right-hand end
def rsplit(s, sep=None, maxsplit=-1):
    """rsplit(s [,sep [,maxsplit]]) -> list of strings

    Split s on the delimiter string sep, scanning from the end of the
    string towards the front, and return the resulting list of words.
    At most maxsplit splits are made when maxsplit is given.  When sep
    is omitted or None, any run of whitespace acts as a separator.
    """
    pieces = s.rsplit(sep, maxsplit)
    return pieces

305
# Join fields with optional separator
def join(words, sep=' '):
    """join(list [,sep]) -> string

    Return a string composed of the words in list, with
    intervening occurrences of sep.  The default separator is a
    single space.

    (joinfields and join are synonymous)

    """
    return sep.join(words)
joinfields = join
318

319 320 321
# Find substring, raise exception if not found
def index(s, *args):
    """index(s, sub [,start [,end]]) -> int

    Like find but raises ValueError when the substring is not found.
    The arguments after s are passed through to s.index().

    """
    return s.index(*args)
327

328
# Find last substring, raise exception if not found
def rindex(s, *args):
    """rindex(s, sub [,start [,end]]) -> int

    Like rfind but raises ValueError when the substring is not found.
    The arguments after s are passed through to s.rindex().

    """
    return s.rindex(*args)
336 337

# Count non-overlapping occurrences of substring
def count(s, *args):
    """count(s, sub[, start[,end]]) -> int

    Return the number of occurrences of substring sub in string
    s[start:end].  Optional arguments start and end are
    interpreted as in slice notation.

    """
    return s.count(*args)
347

348
# Find substring, return -1 if not found
def find(s, *args):
    """find(s, sub [,start [,end]]) -> int

    Return the lowest index in s where substring sub is found,
    such that sub is contained within s[start:end].  Optional
    arguments start and end are interpreted as in slice notation.

    Return -1 on failure.

    """
    return s.find(*args)
Guido van Rossum's avatar
Guido van Rossum committed
360

361
# Find last substring, return -1 if not found
def rfind(s, *args):
    """rfind(s, sub [,start [,end]]) -> int

    Return the highest index in s where substring sub is found,
    such that sub is contained within s[start:end].  Optional
    arguments start and end are interpreted as in slice notation.

    Return -1 on failure.

    """
    return s.rfind(*args)
373 374 375 376 377

# For a bit of speed: module-level names for the builtin numeric
# constructors, used by atof, atoi and atol below.
_float, _int, _long = float, int, long
378

379
# Convert string to float
def atof(s):
    """atof(s) -> float

    Return the floating point number represented by the string s.

    """
    return _float(s)

388

Guido van Rossum's avatar
Guido van Rossum committed
389
# Convert string to integer
def atoi(s, base=10):
    """atoi(s [,base]) -> int

    Return the integer represented by the string s in the given
    base, which defaults to 10.  The string s must consist of one
    or more digits, possibly preceded by a sign.  If base is 0, it
    is chosen from the leading characters of s, 0 for octal, 0x or
    0X for hexadecimal.  If base is 16, a preceding 0x or 0X is
    accepted.

    """
    return _int(s, base)
402

Guido van Rossum's avatar
Guido van Rossum committed
403

404
# Convert string to long integer
def atol(s, base=10):
    """atol(s [,base]) -> long

    Return the long integer represented by the string s in the
    given base, which defaults to 10.  The string s must consist
    of one or more digits, possibly preceded by a sign.  If base
    is 0, it is chosen from the leading characters of s, 0 for
    octal, 0x or 0X for hexadecimal.  If base is 16, a preceding
    0x or 0X is accepted.  A trailing L or l is not accepted,
    unless base is 0.

    """
    return _long(s, base)
418

419

Guido van Rossum's avatar
Guido van Rossum committed
420
# Left-justify a string
def ljust(s, width, *args):
    """ljust(s, width[, fillchar]) -> string

    Return a left-justified version of s, in a field of the
    specified width, padded with spaces as needed.  The string is
    never truncated.  If specified the fillchar is used instead of spaces.

    """
    return s.ljust(width, *args)
Guido van Rossum's avatar
Guido van Rossum committed
430 431

# Right-justify a string
def rjust(s, width, *args):
    """rjust(s, width[, fillchar]) -> string

    Return a right-justified version of s, in a field of the
    specified width, padded with spaces as needed.  The string is
    never truncated.  If specified the fillchar is used instead of spaces.

    """
    return s.rjust(width, *args)
Guido van Rossum's avatar
Guido van Rossum committed
441 442

# Center a string
def center(s, width, *args):
    """center(s, width[, fillchar]) -> string

    Return a centered version of s, in a field of the specified
    width, padded with spaces as needed.  The string is never
    truncated.  If specified the fillchar is used instead of spaces.

    """
    return s.center(width, *args)
Guido van Rossum's avatar
Guido van Rossum committed
452 453 454 455 456

# Zero-fill a number, e.g., (12, 3) --> '012' and (-3, 3) --> '-03'
# Decadent feature: the argument may be a string or a number
# (Use of this is deprecated; it should be a string as with ljust c.s.)
def zfill(x, width):
    """zfill(x, width) -> string

    Pad a numeric string x with zeros on the left, to fill a field
    of the specified width.  The string x is never truncated.
    If x is not a string, its repr() is padded instead.

    """
    if not isinstance(x, basestring):
        x = repr(x)
    return x.zfill(width)
466 467 468

# Expand tabs in a string.
# Doesn't take non-printing chars into account, but does understand \n.
def expandtabs(s, tabsize=8):
    """expandtabs(s [,tabsize]) -> string

    Return a copy of the string s with all tab characters replaced
    by the appropriate number of spaces, depending on the current
    column, and the tabsize (default 8).

    """
    return s.expandtabs(tabsize)
478

479
# Character translation through look-up table.
def translate(s, table, deletions=""):
    """translate(s,table [,deletions]) -> string

    Return a copy of the string s, where all characters occurring
    in the optional argument deletions are removed, and the
    remaining characters have been mapped through the given
    translation table, which must be a string of length 256.  The
    deletions argument is not allowed for Unicode strings.

    When deletions is non-empty or table is None, both arguments are
    handed to s.translate() unchanged.

    """
    if deletions or table is None:
        return s.translate(table, deletions)
    else:
        # Add s[:0] so that if s is Unicode and table is an 8-bit string,
        # table is converted to Unicode.  This means that table *cannot*
        # be a dictionary -- for that feature, use u.translate() directly.
        return s.translate(table + s[:0])
497

498 499
# Capitalize a string, e.g. "aBc  dEf" -> "Abc  def".
def capitalize(s):
    """capitalize(s) -> string

    Return a copy of the string s with only its first character
    capitalized.  Delegates to s.capitalize().

    """
    return s.capitalize()
507

508
# Substring replacement (global)
def replace(s, old, new, maxsplit=-1):
    """replace (str, old, new[, maxsplit]) -> string

    Return a copy of string str with all occurrences of substring
    old replaced by new. If the optional argument maxsplit is
    given, only the first maxsplit occurrences are replaced.

    """
    # maxsplit is passed as the count argument of s.replace().
    return s.replace(old, new, maxsplit)
518 519


520 521
# Try importing optional built-in module "strop" -- if it exists,
# it redefines some string operations that are 100-1000 times faster.
# It also defines values for whitespace, lowercase and uppercase
# that match <ctype.h>'s definitions.

try:
    from strop import maketrans, lowercase, uppercase, whitespace
    # Rebuild letters from the strop-provided case strings.
    letters = lowercase + uppercase
except ImportError:
    pass                                          # Use the original versions
530 531 532 533 534

########################################################################
# the Formatter class
# see PEP 3101 for details and purpose of this class

# The hard parts are reused from the C implementation.  They're exposed as "_"
# prefixed methods of str and unicode.

# The overall parser is implemented in str._formatter_parser.
# The field name parser is implemented in str._formatter_field_name_split

class Formatter(object):
    """Format strings by parsing replacement fields and filling them in.

    format() and vformat() are the entry points.  The remaining public
    methods (get_value, check_unused_args, format_field, convert_field,
    parse, get_field) are the individual steps that they call.
    """

    def format(self, format_string, *args, **kwargs):
        """Format format_string with the given arguments; see vformat()."""
        return self.vformat(format_string, args, kwargs)

    def vformat(self, format_string, args, kwargs):
        """Format format_string from an args sequence and a kwargs mapping.

        After formatting, check_unused_args() is called with the set of
        argument keys that were referenced.
        """
        used_args = set()
        # 2 is the number of nested format-spec levels allowed.
        result = self._vformat(format_string, args, kwargs, used_args, 2)
        self.check_unused_args(used_args, args, kwargs)
        return result

    def _vformat(self, format_string, args, kwargs, used_args, recursion_depth):
        """Do the formatting, recording referenced argument keys in used_args.

        Raises ValueError once recursion_depth drops below zero.
        """
        if recursion_depth < 0:
            raise ValueError('Max string recursion exceeded')
        result = []
        for literal_text, field_name, format_spec, conversion in \
                self.parse(format_string):

            # output the literal text
            if literal_text:
                result.append(literal_text)

            # if there's a field, output it
            if field_name is not None:
                # this is some markup, find the object and do
                #  the formatting

                # given the field_name, find the object it references
                #  and the argument it came from
                obj, arg_used = self.get_field(field_name, args, kwargs)
                used_args.add(arg_used)

                # do any conversion on the resulting object
                obj = self.convert_field(obj, conversion)

                # expand the format spec, if needed
                format_spec = self._vformat(format_spec, args, kwargs,
                                            used_args, recursion_depth-1)

                # format the object and append to the result
                result.append(self.format_field(obj, format_spec))

        return ''.join(result)

    def get_value(self, key, args, kwargs):
        """Return args[key] for an integer key, kwargs[key] otherwise."""
        if isinstance(key, (int, long)):
            return args[key]
        else:
            return kwargs[key]

    def check_unused_args(self, used_args, args, kwargs):
        """Hook called by vformat() after formatting; does nothing here."""
        pass

    def format_field(self, value, format_spec):
        """Return value formatted with the builtin format()."""
        return format(value, format_spec)

    def convert_field(self, value, conversion):
        """Apply a conversion specifier to value.

        'r' gives repr(value), 's' gives str(value) and None returns
        value unchanged.  Any other specifier raises ValueError.
        """
        # do any conversion on the resulting object
        if conversion == 'r':
            return repr(value)
        elif conversion == 's':
            return str(value)
        elif conversion is None:
            return value
        raise ValueError("Unknown conversion specifier {0!s}".format(conversion))

    # returns an iterable that contains tuples of the form:
    # (literal_text, field_name, format_spec, conversion)
    # literal_text can be zero length
    # field_name can be None, in which case there's no
    #  object to format and output
    # if field_name is not None, it is looked up, formatted
    #  with format_spec and conversion and then used
    def parse(self, format_string):
        """Return the parsed pieces of format_string (see comment above)."""
        return format_string._formatter_parser()

    # given a field_name, find the object it references.
    #  field_name:   the field being looked up, e.g. "0.name"
    #                 or "lookup[3]"
    #  args, kwargs: as passed in to vformat
    def get_field(self, field_name, args, kwargs):
        """Resolve field_name and return (object, first).

        first is the leading key of field_name, which is looked up with
        get_value(); the rest of the name is applied to that object as
        attribute and item accesses.
        """
        first, rest = field_name._formatter_field_name_split()

        obj = self.get_value(first, args, kwargs)

        # loop through the rest of the field_name, doing
        #  getattr or getitem as needed
        for is_attr, i in rest:
            if is_attr:
                obj = getattr(obj, i)
            else:
                obj = obj[i]

        return obj, first