Add csv module from CPython and _csv from PyPy

96d0a086 · YOU · Dylan Trotter · 11807baa · 96d0a086 · 96d0a086
Commit 96d0a086 authored Jan 16, 2017 by YOU Committed by Dylan Trotter Jan 16, 2017
Show whitespace changes
Inline Side-by-side

Showing with 1051 additions and 0 deletions

third_party/pypy/_csv.py third_party/pypy/_csv.py +582 -0

third_party/stdlib/csv.py third_party/stdlib/csv.py +469 -0

No files found.
--- a/third_party/pypy/_csv.py
+++ b/third_party/pypy/_csv.py
+__doc__ = """CSV parsing and writing.
+This module provides classes that assist in the reading and writing
+of Comma Separated Value (CSV) files, and implements the interface
+described by PEP 305.  Although many CSV files are simple to parse,
+the format is not formally defined by a stable specification and
+is subtle enough that parsing lines of a CSV file with something
+like line.split(\",\") is bound to fail.  The module supports three
+basic APIs: reading, writing, and registration of dialects.
+DIALECT REGISTRATION:
+Readers and writers support a dialect argument, which is a convenient
+handle on a group of settings.  When the dialect argument is a string,
+it identifies one of the dialects previously registered with the module.
+If it is a class or instance, the attributes of the argument are used as
+the settings for the reader or writer:
+    class excel:
+        delimiter = ','
+        quotechar = '\"'
+        escapechar = None
+        doublequote = True
+        skipinitialspace = False
+        lineterminator = '\\r\\n'
+        quoting = QUOTE_MINIMAL
+SETTINGS:
+    * quotechar - specifies a one-character string to use as the
+        quoting character.  It defaults to '\"'.
+    * delimiter - specifies a one-character string to use as the
+        field separator.  It defaults to ','.
+    * skipinitialspace - specifies how to interpret whitespace which
+        immediately follows a delimiter.  It defaults to False, which
+        means that whitespace immediately following a delimiter is part
+        of the following field.
+    * lineterminator -  specifies the character sequence which should
+        terminate rows.
+    * quoting - controls when quotes should be generated by the writer.
+        It can take on any of the following module constants:
+        csv.QUOTE_MINIMAL means only when required, for example, when a
+            field contains either the quotechar or the delimiter
+        csv.QUOTE_ALL means that quotes are always placed around fields.
+        csv.QUOTE_NONNUMERIC means that quotes are always placed around
+            fields which do not parse as integers or floating point
+            numbers.
+        csv.QUOTE_NONE means that quotes are never placed around fields.
+    * escapechar - specifies a one-character string used to escape
+        the delimiter when quoting is set to QUOTE_NONE.
+    * doublequote - controls the handling of quotes inside fields.  When
+        True, two consecutive quotes are interpreted as one during read,
+        and when writing, each quote character embedded in the data is
+        written as two quotes.
+"""
+__version__ = "1.0"
+# Names listed for export.  Note that the list also contains
+# underscore-prefixed names (_call_dialect, _dialects, _field_limit).
+__all__ = [
+    'Dialect', 'Error', 'QUOTE_ALL', 'QUOTE_MINIMAL', 'QUOTE_NONE',
+    'QUOTE_NONNUMERIC', 'Reader', 'Writer', '__doc__', '__version__',
+    '_call_dialect', '_dialects', '_field_limit', 'field_size_limit',
+    'get_dialect', 'list_dialects', 'reader', 'register_dialect',
+    'undefined', 'unregister_dialect', 'writer'
+]
+# Quoting modes, numbered 0..3 in this order.
+QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE = range(4)
+# Registry mapping a dialect name to its Dialect instance.
+_dialects = {}
+_field_limit = 128 * 1024 # max parsed field size
+class Error(Exception):
+    """Exception raised by this module to report CSV-related failures."""
+class Dialect(object):
+    """CSV dialect
+    The Dialect type records CSV parsing and generation options.
+    An instance is built from an optional base ``dialect`` (a registered
+    dialect name, an existing Dialect, or any object carrying option
+    attributes) plus keyword overrides.  The resulting options are
+    exposed through read-only properties.
+    Raises TypeError for an unknown keyword, for a value rejected by its
+    converter, or when delimiter, lineterminator or (with quoting
+    enabled) quotechar end up unset."""
+    __slots__ = ["_delimiter", "_doublequote", "_escapechar",
+                 "_lineterminator", "_quotechar", "_quoting",
+                 "_skipinitialspace", "_strict"]
+    def __new__(cls, dialect, **kwargs):
+        # Reject keywords that do not name one of the known options.
+        for name in kwargs:
+            if '_' + name not in Dialect.__slots__:
+                raise TypeError("unexpected keyword argument '%s'" %
+                                (name,))
+        if dialect is not None:
+            # A string refers to a previously registered dialect.
+            if isinstance(dialect, basestring):
+                dialect = get_dialect(dialect)
+            # Can we reuse this instance?
+            # (yes, when it already is a Dialect and no keyword carries
+            # a value other than None)
+            if (isinstance(dialect, Dialect)
+                and all(value is None for value in kwargs.itervalues())):
+                return dialect
+        self = object.__new__(cls)
+        # Converters: return the validated value or raise TypeError.
+        # They read ``name`` from the copy loop below for their message.
+        def set_char(x):
+            if x is None:
+                return None
+            if isinstance(x, str) and len(x) <= 1:
+                return x
+            raise TypeError("%r must be a 1-character string" % (name,))
+        def set_str(x):
+            if isinstance(x, str):
+                return x
+            raise TypeError("%r must be a string" % (name,))
+        def set_quoting(x):
+            if x in range(4):
+                return x
+            raise TypeError("bad 'quoting' value")
+        attributes = {"delimiter": (',', set_char),
+                      "doublequote": (True, bool),
+                      "escapechar": (None, set_char),
+                      "lineterminator": ("\r\n", set_str),
+                      "quotechar": ('"', set_char),
+                      "quoting": (QUOTE_MINIMAL, set_quoting),
+                      "skipinitialspace": (False, bool),
+                      "strict": (False, bool),
+                      }
+        # Copy attributes
+        # Lookup order per option: keyword argument, then attribute of
+        # the base dialect, then the default from ``attributes``.
+        notset = object()
+        for name in Dialect.__slots__:
+            name = name[1:]
+            value = notset
+            if name in kwargs:
+                value = kwargs[name]
+            elif dialect is not None:
+                value = getattr(dialect, name, notset)
+            # mapping by name: (default, converter)
+            if value is notset:
+                value = attributes[name][0]
+                # The quoting default depends on quotechar, which has
+                # already been stored because "_quotechar" precedes
+                # "_quoting" in __slots__.
+                if name == 'quoting' and not self.quotechar:
+                    value = QUOTE_NONE
+            else:
+                converter = attributes[name][1]
+                if converter:
+                    value = converter(value)
+            # setattr(self, '_' + name, value)
+            # NOTE(review): this writes through __dict__ although the
+            # class declares __slots__; presumably the target runtime
+            # does not enforce __slots__ - confirm before running this
+            # on an interpreter that does.
+            self.__dict__['_' + name] = value
+        # Final consistency checks on the assembled options.
+        if not self.delimiter:
+            raise TypeError("delimiter must be set")
+        if self.quoting != QUOTE_NONE and not self.quotechar:
+            raise TypeError("quotechar must be set if quoting enabled")
+        if not self.lineterminator:
+            raise TypeError("lineterminator must be set")
+        return self
+    # Read-only views of the stored options.
+    delimiter        = property(lambda self: self._delimiter)
+    doublequote      = property(lambda self: self._doublequote)
+    escapechar       = property(lambda self: self._escapechar)
+    lineterminator   = property(lambda self: self._lineterminator)
+    quotechar        = property(lambda self: self._quotechar)
+    quoting          = property(lambda self: self._quoting)
+    skipinitialspace = property(lambda self: self._skipinitialspace)
+    strict           = property(lambda self: self._strict)
+def _call_dialect(dialect_inst, kwargs):
+    """Build (or reuse) a Dialect from a base dialect and a dict of overrides."""
+    overrides = kwargs
+    return Dialect(dialect_inst, **overrides)
+def register_dialect(name, dialect=None, **kwargs):
+    """Register a dialect under a string name.
+    csv.register_dialect(name, dialect)
+    The dialect and any keyword overrides are turned into a Dialect
+    instance, which is stored under name (replacing an earlier entry).
+    Raises TypeError when name is not a string."""
+    if isinstance(name, basestring):
+        _dialects[name] = _call_dialect(dialect, kwargs)
+        return
+    raise TypeError("dialect name must be a string or unicode")
+def unregister_dialect(name):
+    """Remove the dialect registered under name.
+    csv.unregister_dialect(name)
+    Raises Error when no dialect is registered under that name."""
+    if name not in _dialects:
+        raise Error("unknown dialect")
+    del _dialects[name]
+def get_dialect(name):
+    """Look up a registered dialect.
+    dialect = csv.get_dialect(name)
+    Returns the Dialect instance stored under name and raises Error
+    when the name has not been registered."""
+    if name in _dialects:
+        return _dialects[name]
+    raise Error("unknown dialect")
+def list_dialects():
+    """Return a list of all known dialect names.
+    names = csv.list_dialects()"""
+    return [registered for registered in _dialects]
+class Reader(object):
+    """CSV reader
+    Reader objects are responsible for reading and parsing tabular data
+    in CSV format.
+    Lines are pulled from the wrapped iterator and fed, one character
+    (or run of characters) at a time, through a state machine.  Each
+    call to next() returns one record as a list of fields; a record can
+    span several input lines when a quoted field or an escaped line end
+    is involved."""
+    # Parser states.
+    (START_RECORD, START_FIELD, ESCAPED_CHAR, IN_FIELD,
+     IN_QUOTED_FIELD, ESCAPE_IN_QUOTED_FIELD, QUOTE_IN_QUOTED_FIELD,
+     EAT_CRNL) = range(8)
+    def __init__(self, iterator, dialect=None, **kwargs):
+        """Wrap ``iterator`` (yielding lines) using the given dialect
+        and keyword overrides."""
+        self.dialect = _call_dialect(dialect, kwargs)
+        self.input_iter = iter(iterator)
+        self.line_num = 0
+        self._parse_reset()
+    def _parse_reset(self):
+        """Clear the per-record parsing state."""
+        self.field = ''
+        self.fields = []
+        self.state = self.START_RECORD
+        self.numeric_field = False
+    def __iter__(self):
+        return self
+    def next(self):
+        """Return the next record as a list of fields.
+        Raises StopIteration at end of input, and Error when a line
+        contains a NUL byte or the input ends while field text is still
+        pending."""
+        self._parse_reset()
+        while True:
+            try:
+                line = next(self.input_iter)
+            except StopIteration:
+                # End of input OR exception
+                if len(self.field) > 0:
+                    raise Error("newline inside string")
+                raise
+            self.line_num += 1
+            if '\0' in line:
+                raise Error("line contains NULL byte")
+            pos = 0
+            while pos < len(line):
+                pos = self._parse_process_char(line, pos)
+            self._parse_eol()
+            # Back in START_RECORD means the record is complete;
+            # otherwise it continues on the next input line.
+            if self.state == self.START_RECORD:
+                break
+        fields = self.fields
+        self.fields = []
+        return fields
+    def _parse_process_char(self, line, pos):
+        """Consume input starting at line[pos] and return the index of
+        the next character to process."""
+        c = line[pos]
+        if self.state == self.IN_FIELD:
+            # in unquoted field
+            # pos2 scans ahead over a run of ordinary characters so the
+            # whole run can be appended in one go.
+            pos2 = pos
+            while True:
+                if c in '\n\r':
+                    # end of line - return [fields]
+                    if pos2 > pos:
+                        self._parse_add_char(line[pos:pos2])
+                        pos = pos2
+                    self._parse_save_field()
+                    self.state = self.EAT_CRNL
+                elif c == self.dialect.escapechar:
+                    # possible escaped character
+                    # Flush the run scanned so far, then take the next
+                    # character literally.  (Rewinding pos2 instead lost
+                    # a character whenever exactly one ordinary
+                    # character preceded the escapechar.)
+                    if pos2 > pos:
+                        self._parse_add_char(line[pos:pos2])
+                        pos = pos2
+                    self.state = self.ESCAPED_CHAR
+                elif c == self.dialect.delimiter:
+                    # save field - wait for new field
+                    if pos2 > pos:
+                        self._parse_add_char(line[pos:pos2])
+                        pos = pos2
+                    self._parse_save_field()
+                    self.state = self.START_FIELD
+                else:
+                    # normal character - save in field
+                    pos2 += 1
+                    if pos2 < len(line):
+                        c = line[pos2]
+                        continue
+                break
+            # Only reached with pos2 > pos when the run hit the end of
+            # the line: append it and resume after its last character.
+            if pos2 > pos:
+                self._parse_add_char(line[pos:pos2])
+                pos = pos2 - 1
+        elif self.state == self.START_RECORD:
+            if c in '\n\r':
+                self.state = self.EAT_CRNL
+            else:
+                self.state = self.START_FIELD
+                # restart process
+                # (the same character is handled again as the start of
+                # a field; the result is not needed because that path
+                # also consumes exactly one character)
+                self._parse_process_char(line, pos)
+        elif self.state == self.START_FIELD:
+            if c in '\n\r':
+                # save empty field - return [fields]
+                self._parse_save_field()
+                self.state = self.EAT_CRNL
+            elif (c == self.dialect.quotechar
+                  and self.dialect.quoting != QUOTE_NONE):
+                # start quoted field
+                self.state = self.IN_QUOTED_FIELD
+            elif c == self.dialect.escapechar:
+                # possible escaped character
+                self.state = self.ESCAPED_CHAR
+            elif c == ' ' and self.dialect.skipinitialspace:
+                # ignore space at start of field
+                pass
+            elif c == self.dialect.delimiter:
+                # save empty field
+                self._parse_save_field()
+            else:
+                # begin new unquoted field
+                if self.dialect.quoting == QUOTE_NONNUMERIC:
+                    self.numeric_field = True
+                self._parse_add_char(c)
+                self.state = self.IN_FIELD
+        elif self.state == self.ESCAPED_CHAR:
+            self._parse_add_char(c)
+            self.state = self.IN_FIELD
+        elif self.state == self.IN_QUOTED_FIELD:
+            if c == self.dialect.escapechar:
+                # possible escape character
+                self.state = self.ESCAPE_IN_QUOTED_FIELD
+            elif (c == self.dialect.quotechar
+                  and self.dialect.quoting != QUOTE_NONE):
+                if self.dialect.doublequote:
+                    # doublequote; " represented by ""
+                    self.state = self.QUOTE_IN_QUOTED_FIELD
+                else:
+                    #end of quote part of field
+                    self.state = self.IN_FIELD
+            else:
+                # normal character - save in field
+                self._parse_add_char(c)
+        elif self.state == self.ESCAPE_IN_QUOTED_FIELD:
+            self._parse_add_char(c)
+            self.state = self.IN_QUOTED_FIELD
+        elif self.state == self.QUOTE_IN_QUOTED_FIELD:
+            # doublequote - seen a quote in a quoted field
+            if (c == self.dialect.quotechar
+                and self.dialect.quoting != QUOTE_NONE):
+                # save "" as "
+                self._parse_add_char(c)
+                self.state = self.IN_QUOTED_FIELD
+            elif c == self.dialect.delimiter:
+                # save field - wait for new field
+                self._parse_save_field()
+                self.state = self.START_FIELD
+            elif c in '\r\n':
+                # end of line - return [fields]
+                self._parse_save_field()
+                self.state = self.EAT_CRNL
+            elif not self.dialect.strict:
+                self._parse_add_char(c)
+                self.state = self.IN_FIELD
+            else:
+                raise Error("'%c' expected after '%c'" %
+                            (self.dialect.delimiter, self.dialect.quotechar))
+        elif self.state == self.EAT_CRNL:
+            if c not in '\r\n':
+                raise Error("new-line character seen in unquoted field - "
+                            "do you need to open the file "
+                            "in universal-newline mode?")
+        else:
+            raise RuntimeError("unknown state: %r" % (self.state,))
+        return pos + 1
+    def _parse_eol(self):
+        """Update the state machine after a whole input line has been
+        consumed."""
+        if self.state == self.EAT_CRNL:
+            self.state = self.START_RECORD
+        elif self.state == self.START_RECORD:
+            # empty line - return []
+            pass
+        elif self.state == self.IN_FIELD:
+            # in unquoted field
+            # end of line - return [fields]
+            self._parse_save_field()
+            self.state = self.START_RECORD
+        elif self.state == self.START_FIELD:
+            # save empty field - return [fields]
+            self._parse_save_field()
+            self.state = self.START_RECORD
+        elif self.state == self.ESCAPED_CHAR:
+            self._parse_add_char('\n')
+            self.state = self.IN_FIELD
+        elif self.state == self.IN_QUOTED_FIELD:
+            # quoted field continues on the next input line
+            pass
+        elif self.state == self.ESCAPE_IN_QUOTED_FIELD:
+            self._parse_add_char('\n')
+            self.state = self.IN_QUOTED_FIELD
+        elif self.state == self.QUOTE_IN_QUOTED_FIELD:
+            # end of line - return [fields]
+            self._parse_save_field()
+            self.state = self.START_RECORD
+        else:
+            raise RuntimeError("unknown state: %r" % (self.state,))
+    def _parse_save_field(self):
+        """Append the pending field to the record, converting it with
+        float() when it was flagged as numeric."""
+        field, self.field = self.field, ''
+        if self.numeric_field:
+            self.numeric_field = False
+            field = float(field)
+        self.fields.append(field)
+    def _parse_add_char(self, c):
+        """Append the string ``c`` to the pending field, enforcing the
+        module-wide field size limit."""
+        if len(self.field) + len(c) > _field_limit:
+            raise Error("field larger than field limit (%d)" % (_field_limit))
+        self.field += c
+class Writer(object):
+    """CSV writer
+    Writer objects are responsible for generating tabular data
+    in CSV format from sequence input.
+    Each row is formatted according to the dialect and handed, as one
+    string, to the ``write`` method of the wrapped file object."""
+    def __init__(self, file, dialect=None, **kwargs):
+        """Wrap ``file`` (which must offer a callable ``write``) using
+        the given dialect and keyword overrides."""
+        if not (hasattr(file, 'write') and callable(file.write)):
+            raise TypeError("argument 1 must have a 'write' method")
+        self.writeline = file.write
+        self.dialect = _call_dialect(dialect, kwargs)
+    def _join_reset(self):
+        """Start a new, empty output record."""
+        self.rec = []
+        self.num_fields = 0
+    def _join_append(self, field, quoted, quote_empty):
+        """Append one already-stringified field to the current record.
+        ``quoted`` requests quoting up front; ``quote_empty`` is true
+        when an empty field has to be quoted (single-field rows).
+        Raises Error when escaping is needed but no escapechar is set,
+        or when an empty single field cannot be quoted."""
+        dialect = self.dialect
+        # If this is not the first field we need a field separator
+        if self.num_fields > 0:
+            self.rec.append(dialect.delimiter)
+        if dialect.quoting == QUOTE_NONE:
+            # The escapechar has to be escaped first: escaping any
+            # other character inserts escapechars that must not be
+            # escaped again.  Duplicates are dropped so that no
+            # character is escaped twice.
+            candidates = (dialect.escapechar,) + tuple(
+                dialect.lineterminator) + (
+                dialect.delimiter, dialect.quotechar)
+            need_escape = []
+            for c in candidates:
+                if c and c not in need_escape:
+                    need_escape.append(c)
+        else:
+            # Any special character forces the field to be quoted.
+            for c in tuple(dialect.lineterminator) + (
+                dialect.delimiter, dialect.escapechar):
+                if c and c in field:
+                    quoted = True
+            need_escape = ()
+            if dialect.quotechar in field:
+                if dialect.doublequote:
+                    field = field.replace(dialect.quotechar,
+                                          dialect.quotechar * 2)
+                    quoted = True
+                else:
+                    need_escape = (dialect.quotechar,)
+        for c in need_escape:
+            if c and c in field:
+                if not dialect.escapechar:
+                    raise Error("need to escape, but no escapechar set")
+                field = field.replace(c, dialect.escapechar + c)
+        # If field is empty check if it needs to be quoted
+        if field == '' and quote_empty:
+            if dialect.quoting == QUOTE_NONE:
+                raise Error("single empty field record must be quoted")
+            quoted = 1
+        if quoted:
+            field = dialect.quotechar + field + dialect.quotechar
+        self.rec.append(field)
+        self.num_fields += 1
+    def writerow(self, row):
+        """Format ``row`` (a sequence of fields) and write it out,
+        followed by the line terminator.
+        Raises Error when ``row`` has no length."""
+        dialect = self.dialect
+        try:
+            rowlen = len(row)
+        except TypeError:
+            raise Error("sequence expected")
+        # join all fields in internal buffer
+        self._join_reset()
+        for field in row:
+            quoted = False
+            if dialect.quoting == QUOTE_NONNUMERIC:
+                # Anything float() cannot convert counts as
+                # non-numeric.  Exception (rather than a bare except)
+                # lets KeyboardInterrupt and SystemExit propagate.
+                try:
+                    float(field)
+                except Exception:
+                    quoted = True
+                # This changed since 2.5:
+                # quoted = not isinstance(field, (int, long, float))
+            elif dialect.quoting == QUOTE_ALL:
+                quoted = True
+            if field is None:
+                value = ""
+            elif isinstance(field, float):
+                value = repr(field)
+            else:
+                value = str(field)
+            self._join_append(value, quoted, rowlen == 1)
+        # add line terminator
+        self.rec.append(dialect.lineterminator)
+        self.writeline(''.join(self.rec))
+    def writerows(self, rows):
+        """Write every row of ``rows`` with writerow()."""
+        for row in rows:
+            self.writerow(row)
+def reader(*args, **kwargs):
+    """
+    Create a CSV reader.
+    csv_reader = reader(iterable [, dialect]
+                       [optional keyword args])
+    for row in csv_reader:
+        process(row)
+    "iterable" is any object that yields one line of input per
+    iteration, such as a file object or a list.  The optional "dialect"
+    and keyword arguments select the parsing options; keyword arguments
+    override the settings provided by the dialect.
+    The result is an iterator.  Every iteration yields one row of the
+    CSV data (which can span multiple input lines)"""
+    csv_reader = Reader(*args, **kwargs)
+    return csv_reader
+def writer(*args, **kwargs):
+    """
+    Create a CSV writer.
+    csv_writer = csv.writer(fileobj [, dialect]
+                            [optional keyword args])
+    for row in sequence:
+        csv_writer.writerow(row)
+    [or]
+    csv_writer.writerows(rows)
+    "fileobj" must provide a callable write() method; the optional
+    "dialect" and keyword arguments select the formatting options."""
+    csv_writer = Writer(*args, **kwargs)
+    return csv_writer
+# Sentinel that marks "no limit argument supplied".
+undefined = object()
+def field_size_limit(limit=undefined):
+    """Get or set the upper limit on parsed fields.
+    csv.field_size_limit([limit])
+    Returns the limit in force before the call.  Without an argument
+    the limit is left untouched; otherwise limit must be an int or
+    long and becomes the new limit (TypeError is raised for any other
+    type)"""
+    global _field_limit
+    previous = _field_limit
+    if limit is undefined:
+        return previous
+    if isinstance(limit, (int, long)):
+        _field_limit = limit
+        return previous
+    raise TypeError("int expected, got %s" %
+                    (limit.__class__.__name__,))
--- a/third_party/stdlib/csv.py
+++ b/third_party/stdlib/csv.py
+"""
+csv.py - read/write/investigate CSV files
+"""
+import re
+import functools
+reduce = functools.reduce
+# from functools import reduce
+# TODO: Support from foo import * syntax.
+import _csv
+for name in _csv.__all__:
+  globals()[name] = getattr(_csv, name)
+# from _csv import Error, __version__, writer, reader, register_dialect, \
+#                  unregister_dialect, get_dialect, list_dialects, \
+#                  field_size_limit, \
+#                  QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
+#                  __doc__
+# from _csv import Dialect as _Dialect
+_Dialect = _csv.Dialect
+import StringIO as _StringIO
+StringIO = _StringIO.StringIO
+# try:
+#     from cStringIO import StringIO
+# except ImportError:
+#     from StringIO import StringIO
+__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
+            "Error", "Dialect", "__doc__", "excel", "excel_tab",
+            "field_size_limit", "reader", "writer",
+            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
+            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
+class Dialect(object):
+    """Base class for describing a CSV dialect.
+    It is meant to be subclassed (see csv.excel).  Subclasses set the
+    attributes delimiter, quotechar, escapechar, doublequote,
+    skipinitialspace, lineterminator and quoting; the settings are
+    checked when an instance is created.
+    """
+    _name = ""
+    _valid = False
+    # placeholders, to be overridden by subclasses
+    quoting = None
+    lineterminator = None
+    skipinitialspace = None
+    doublequote = None
+    escapechar = None
+    quotechar = None
+    delimiter = None
+    def __init__(self):
+        """Mark instances of subclasses as valid, then validate."""
+        is_derived = self.__class__ != Dialect
+        if is_derived:
+            self._valid = True
+        self._validate()
+    def _validate(self):
+        """Check the settings by building a _csv Dialect from them;
+        a TypeError from that step is re-raised as Error."""
+        try:
+            _Dialect(self)
+        except TypeError as err:
+            # We do this for compatibility with py2.3
+            raise Error(str(err))
+class excel(Dialect):
+    """Dialect with the usual properties of CSV files generated by Excel."""
+    quoting = QUOTE_MINIMAL
+    lineterminator = '\r\n'
+    skipinitialspace = False
+    doublequote = True
+    quotechar = '"'
+    delimiter = ','
+# Make the dialect available under the name "excel".
+register_dialect("excel", excel)
+class excel_tab(excel):
+    """The excel dialect with TAB as the field delimiter."""
+    delimiter = '\t'
+# Make the dialect available under the name "excel-tab".
+register_dialect("excel-tab", excel_tab)
+class DictReader(object):
+    """Reader that returns each row as a dict keyed by field name.
+    When no fieldnames are given, the first row read supplies them."""
+    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
+                 dialect="excel", *args, **kwds):
+        self.reader = reader(f, dialect, *args, **kwds)
+        self.dialect = dialect
+        self.line_num = 0
+        self._fieldnames = fieldnames   # keys for the dicts, or None
+        self.restkey = restkey          # key collecting surplus values
+        self.restval = restval          # value for missing columns
+    def __iter__(self):
+        return self
+    # @property
+    def fieldnames(self):
+        # Lazily take the field names from the first row; an empty
+        # input leaves them as None.
+        names = self._fieldnames
+        if names is None:
+            try:
+                names = self._fieldnames = self.reader.next()
+            except StopIteration:
+                pass
+        # Every access also syncs line_num with the underlying reader.
+        self.line_num = self.reader.line_num
+        return names
+    fieldnames = property(fieldnames)
+    # Assigning to fieldnames replaces the stored names.
+    @fieldnames.setter
+    def fieldnames(self, value):
+        self._fieldnames = value
+    def next(self):
+        """Return the next non-blank row as a dict."""
+        if self.line_num == 0:
+            # Used only for its side effect.
+            self.fieldnames
+        row = self.reader.next()
+        self.line_num = self.reader.line_num
+        # Blank rows are skipped; they would only produce a dict full
+        # of restval entries.
+        while row == []:
+            row = self.reader.next()
+        names = self.fieldnames
+        record = dict(zip(names, row))
+        name_count, value_count = len(names), len(row)
+        if name_count < value_count:
+            # surplus values are collected under restkey
+            record[self.restkey] = row[name_count:]
+        elif name_count > value_count:
+            # missing values are filled with restval
+            for missing in names[value_count:]:
+                record[missing] = self.restval
+        return record
+class DictWriter(object):
+    """Writer that takes each row as a dict keyed by field name."""
+    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
+                 dialect="excel", *args, **kwds):
+        self.fieldnames = fieldnames    # keys, in output column order
+        self.restval = restval          # value for keys a dict lacks
+        if extrasaction.lower() not in ("raise", "ignore"):
+            raise ValueError(
+                "extrasaction (%s) must be 'raise' or 'ignore'" %
+                extrasaction)
+        self.extrasaction = extrasaction
+        self.writer = writer(f, dialect, *args, **kwds)
+    def writeheader(self):
+        """Write a row made of the field names themselves."""
+        header = {}
+        for name in self.fieldnames:
+            header[name] = name
+        self.writerow(header)
+    def _dict_to_list(self, rowdict):
+        """Return the values of rowdict in fieldnames order, using
+        restval for absent keys.  With extrasaction "raise", keys that
+        are not in fieldnames cause a ValueError."""
+        if self.extrasaction == "raise":
+            unexpected = []
+            for key in rowdict:
+                if key not in self.fieldnames:
+                    unexpected.append(key)
+            if unexpected:
+                raise ValueError("dict contains fields not in fieldnames: "
+                                 + ", ".join([repr(x) for x in unexpected]))
+        values = []
+        for name in self.fieldnames:
+            values.append(rowdict.get(name, self.restval))
+        return values
+    def writerow(self, rowdict):
+        """Write one dict as a CSV row."""
+        return self.writer.writerow(self._dict_to_list(rowdict))
+    def writerows(self, rowdicts):
+        """Write several dicts; all are converted before any is written."""
+        rows = [self._dict_to_list(rowdict) for rowdict in rowdicts]
+        return self.writer.writerows(rows)
+# Guard Sniffer's type checking against builds that exclude complex()
+# try:
+#     complex
+# except NameError:
+#     complex = float
+# The conditional guard above is disabled: within this module the name
+# complex is rebound to float unconditionally.
+complex = float
+class Sniffer(object):
+    '''
+    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
+    Returns a Dialect object.
+    '''
+    def __init__(self):
+        """Set up the delimiter preference list."""
+        # in case there is more than one possible delimiter
+        candidates = [',', '\t', ';', ' ', ':']
+        self.preferred = candidates
+    def sniff(self, sample, delimiters=None):
+        """
+        Returns a Dialect subclass describing the sample; raises Error
+        when no delimiter can be determined
+        """
+        guess = self._guess_quote_and_delimiter(sample, delimiters)
+        quotechar, doublequote, delimiter, skipinitialspace = guess
+        if not delimiter:
+            # the quote-based guess failed, fall back to frequencies
+            delimiter, skipinitialspace = self._guess_delimiter(sample,
+                                                                delimiters)
+            if not delimiter:
+                raise Error("Could not determine delimiter")
+        class dialect(Dialect):
+            _name = "sniffed"
+            lineterminator = '\r\n'
+            quoting = QUOTE_MINIMAL
+            # escapechar = ''
+        # _csv.reader won't accept a quotechar of ''
+        dialect.quotechar = quotechar or '"'
+        dialect.delimiter = delimiter
+        dialect.doublequote = doublequote
+        dialect.skipinitialspace = skipinitialspace
+        return dialect
+    def _guess_quote_and_delimiter(self, data, delimiters):
+        """
+        Looks for text enclosed between two identical quotes
+        (the probable quotechar) which are preceded and followed
+        by the same character (the probable delimiter).
+        For example:
+                         ,'some text',
+        The quote with the most wins, same with the delimiter.
+        If there is no quotechar the delimiter can't be determined
+        this way.
+        Returns a tuple (quotechar, doublequote, delimiter,
+        skipinitialspace); ('', False, None, 0) when nothing quoted is
+        found.  When delimiters is not None, only characters contained
+        in it are counted as delimiter candidates.
+        """
+        matches = []
+        # Patterns are tried from most to least specific; the first one
+        # that matches anything is used.
+        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
+                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
+                      r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
+                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
+            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
+            matches = regexp.findall(data)
+            if matches:
+                break
+        if not matches:
+            # (quotechar, doublequote, delimiter, skipinitialspace)
+            return ('', False, None, 0)
+        # Tally how often each quote and delimiter candidate occurs and
+        # how many matches have a space after the delimiter.
+        quotes = {}
+        delims = {}
+        spaces = 0
+        for m in matches:
+            n = regexp.groupindex['quote'] - 1
+            key = m[n]
+            if key:
+                quotes[key] = quotes.get(key, 0) + 1
+            try:
+                n = regexp.groupindex['delim'] - 1
+                key = m[n]
+            except KeyError:
+                # the winning pattern has no delimiter group
+                continue
+            if key and (delimiters is None or key in delimiters):
+                delims[key] = delims.get(key, 0) + 1
+            try:
+                n = regexp.groupindex['space'] - 1
+            except KeyError:
+                continue
+            if m[n]:
+                spaces += 1
+        quotechar = reduce(lambda a, b, quotes = quotes:
+                           (quotes[a] > quotes[b]) and a or b, quotes.keys())
+        if delims:
+            delim = reduce(lambda a, b, delims = delims:
+                           (delims[a] > delims[b]) and a or b, delims.keys())
+            # initial space is assumed only when every occurrence of the
+            # winning delimiter was followed by a space
+            skipinitialspace = delims[delim] == spaces
+            if delim == '\n': # most likely a file with a single column
+                delim = ''
+        else:
+            # there is *no* delimiter, it's a single column of quoted data
+            delim = ''
+            skipinitialspace = 0
+        # if we see an extra quote between delimiters, we've got a
+        # double quoted format
+        dq_regexp = re.compile(
+                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
+                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)
+        if dq_regexp.search(data):
+            doublequote = True
+        else:
+            doublequote = False
+        return (quotechar, doublequote, delim, skipinitialspace)
+    def _guess_delimiter(self, data, delimiters):
+        """
+        The delimiter /should/ occur the same number of times on
+        each row. However, due to malformed data, it may not. We don't want
+        an all or nothing approach, so we allow for small variations in this
+        number.
+          1) build a table of the frequency of each character on every line.
+          2) build a table of frequencies of this frequency (meta-frequency?),
+             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
+             7 times in 2 rows'
+          3) use the mode of the meta-frequency to determine the /expected/
+             frequency for that character
+          4) find out how often the character actually meets that goal
+          5) the character that best meets its goal is the delimiter
+        For performance reasons, the data is evaluated in chunks, so it can
+        try and evaluate the smallest portion of the data possible, evaluating
+        additional chunks as necessary.
+        """
+        data = filter(None, data.split('\n'))
+        ascii = [chr(c) for c in range(127)] # 7-bit ASCII
+        # build frequency tables
+        chunkLength = min(10, len(data))
+        iteration = 0
+        charFrequency = {}
+        modes = {}
+        delims = {}
+        start, end = 0, min(chunkLength, len(data))
+        while start < len(data):
+            iteration += 1
+            for line in data[start:end]:
+                for char in ascii:
+                    metaFrequency = charFrequency.get(char, {})
+                    # must count even if frequency is 0
+                    freq = line.count(char)
+                    # value is the mode
+                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
+                    charFrequency[char] = metaFrequency
+            for char in charFrequency.keys():
+                items = charFrequency[char].items()
+                if len(items) == 1 and items[0][0] == 0:
+                    continue
+                # get the mode of the frequencies
+                if len(items) > 1:
+                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
+                                         items)
+                    # adjust the mode - subtract the sum of all
+                    # other frequencies
+                    items.remove(modes[char])
+                    modes[char] = (modes[char][0], modes[char][1]
+                                   - reduce(lambda a, b: (0, a[1] + b[1]),
+                                            items)[1])
+                else:
+                    modes[char] = items[0]
+            # build a list of possible delimiters
+            modeList = modes.items()
+            total = float(chunkLength * iteration)
+            # (rows of consistent data) / (number of rows) = 100%
+            consistency = 1.0
+            # minimum consistency threshold
+            threshold = 0.9
+            while len(delims) == 0 and consistency >= threshold:
+                for k, v in modeList:
+                    if v[0] > 0 and v[1] > 0:
+                        if ((v[1]/total) >= consistency and
+                            (delimiters is None or k in delimiters)):
+                            delims[k] = v
+                consistency -= 0.01
+            if len(delims) == 1:
+                delim = delims.keys()[0]
+                skipinitialspace = (data[0].count(delim) ==
+                                    data[0].count("%c " % delim))
+                return (delim, skipinitialspace)
+            # analyze another chunkLength lines
+            start = end
+            end += chunkLength
+        if not delims:
+            return ('', 0)
+        # if there's more than one, fall back to a 'preferred' list
+        if len(delims) > 1:
+            for d in self.preferred:
+                if d in delims.keys():
+                    skipinitialspace = (data[0].count(d) ==
+                                        data[0].count("%c " % d))
+                    return (d, skipinitialspace)
+        # nothing else indicates a preference, pick the character that
+        # dominates(?)
+        items = [(v,k) for (k,v) in delims.items()]
+        items.sort()
+        delim = items[-1][1]
+        skipinitialspace = (data[0].count(delim) ==
+                            data[0].count("%c " % delim))
+        return (delim, skipinitialspace)
+    def has_header(self, sample):
+        # Creates a dictionary of types of data in each column. If any
+        # column is of a single type (say, integers), *except* for the first
+        # row, then the first row is presumed to be labels. If the type
+        # can't be determined, it is assumed to be a string in which case
+        # the length of the string is the determining factor: if all of the
+        # rows except for the first are the same length, it's a header.
+        # Finally, a 'vote' is taken at the end for each column, adding or
+        # subtracting from the likelihood of the first row being a header.
+        rdr = reader(StringIO(sample), self.sniff(sample))
+        header = rdr.next() # assume first row is header
+        columns = len(header)
+        columnTypes = {}
+        for i in range(columns): columnTypes[i] = None
+        checked = 0
+        for row in rdr:
+            # arbitrary number of rows to check, to keep it sane
+            if checked > 20:
+                break
+            checked += 1
+            if len(row) != columns:
+                continue # skip rows that have irregular number of columns
+            for col in columnTypes.keys():
+                for thisType in [int, long, float, complex]:
+                    try:
+                        thisType(row[col])
+                        break
+                    except (ValueError, OverflowError):
+                        pass
+                else:
+                    # fallback to length of string
+                    thisType = len(row[col])
+                # treat longs as ints
+                if thisType == long:
+                    thisType = int
+                if thisType != columnTypes[col]:
+                    if columnTypes[col] is None: # add new column type
+                        columnTypes[col] = thisType
+                    else:
+                        # type is inconsistent, remove column from
+                        # consideration
+                        del columnTypes[col]
+        # finally, compare results against first row and "vote"
+        # on whether it's a header
+        hasHeader = 0
+        for col, colType in columnTypes.items():
+            if type(colType) == type(0): # it's a length
+                if len(header[col]) != colType:
+                    hasHeader += 1
+                else:
+                    hasHeader -= 1
+            else: # attempt typecast
+                try:
+                    colType(header[col])
+                except (ValueError, TypeError):
+                    hasHeader += 1
+                else:
+                    hasHeader -= 1
+        return hasHeader > 0