Commit 1448d471 authored by Skip Montanaro

rework Sniffer api significantly

parent 48816c6f
...@@ -9,6 +9,11 @@ from _csv import Error, __version__, writer, reader, register_dialect, \ ...@@ -9,6 +9,11 @@ from _csv import Error, __version__, writer, reader, register_dialect, \
QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \ QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
__doc__ __doc__
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE", __all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
"Error", "Dialect", "excel", "excel_tab", "reader", "writer", "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
"register_dialect", "get_dialect", "list_dialects", "Sniffer", "register_dialect", "get_dialect", "list_dialects", "Sniffer",
...@@ -147,52 +152,39 @@ class DictWriter: ...@@ -147,52 +152,39 @@ class DictWriter:
class Sniffer: class Sniffer:
''' '''
"Sniffs" the format of a CSV file (i.e. delimiter, quotechar) "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Returns a csv.Dialect object. Returns a Dialect object.
''' '''
def __init__(self, sample = 16 * 1024): def __init__(self):
# in case there is more than one possible delimiter # in case there is more than one possible delimiter
self.preferred = [',', '\t', ';', ' ', ':'] self.preferred = [',', '\t', ';', ' ', ':']
# amount of data (in bytes) to sample
self.sample = sample
def sniff(self, sample):
def sniff(self, fileobj):
""" """
Takes a file-like object and returns a dialect (or None) Returns a dialect (or None) corresponding to the sample
""" """
self.fileobj = fileobj
data = fileobj.read(self.sample)
quotechar, delimiter, skipinitialspace = \ quotechar, delimiter, skipinitialspace = \
self._guessQuoteAndDelimiter(data) self._guess_quote_and_delimiter(sample)
if delimiter is None: if delimiter is None:
delimiter, skipinitialspace = self._guessDelimiter(data) delimiter, skipinitialspace = self._guess_delimiter(sample)
class SniffedDialect(Dialect): class dialect(Dialect):
_name = "sniffed" _name = "sniffed"
lineterminator = '\r\n' lineterminator = '\r\n'
quoting = QUOTE_MINIMAL quoting = QUOTE_MINIMAL
# escapechar = '' # escapechar = ''
doublequote = False doublequote = False
SniffedDialect.delimiter = delimiter
SniffedDialect.quotechar = quotechar
SniffedDialect.skipinitialspace = skipinitialspace
self.dialect = SniffedDialect dialect.delimiter = delimiter
return self.dialect # _csv.reader won't accept a quotechar of ''
dialect.quotechar = quotechar or '"'
dialect.skipinitialspace = skipinitialspace
return dialect
def hasHeaders(self):
return self._hasHeaders(self.fileobj, self.dialect)
def _guess_quote_and_delimiter(self, data):
def register_dialect(self, name='sniffed'):
register_dialect(name, self.dialect)
def _guessQuoteAndDelimiter(self, data):
""" """
Looks for text enclosed between two identical quotes Looks for text enclosed between two identical quotes
(the probable quotechar) which are preceded and followed (the probable quotechar) which are preceded and followed
...@@ -256,7 +248,7 @@ class Sniffer: ...@@ -256,7 +248,7 @@ class Sniffer:
return (quotechar, delim, skipinitialspace) return (quotechar, delim, skipinitialspace)
def _guessDelimiter(self, data): def _guess_delimiter(self, data):
""" """
The delimiter /should/ occur the same number of times on The delimiter /should/ occur the same number of times on
each row. However, due to malformed data, it may not. We don't want each row. However, due to malformed data, it may not. We don't want
...@@ -290,12 +282,12 @@ class Sniffer: ...@@ -290,12 +282,12 @@ class Sniffer:
iteration += 1 iteration += 1
for line in data[start:end]: for line in data[start:end]:
for char in ascii: for char in ascii:
metafrequency = charFrequency.get(char, {}) metaFrequency = charFrequency.get(char, {})
# must count even if frequency is 0 # must count even if frequency is 0
freq = line.strip().count(char) freq = line.strip().count(char)
# value is the mode # value is the mode
metafrequency[freq] = metafrequency.get(freq, 0) + 1 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
charFrequency[char] = metafrequency charFrequency[char] = metaFrequency
for char in charFrequency.keys(): for char in charFrequency.keys():
items = charFrequency[char].items() items = charFrequency[char].items()
...@@ -356,7 +348,7 @@ class Sniffer: ...@@ -356,7 +348,7 @@ class Sniffer:
return (delim, skipinitialspace) return (delim, skipinitialspace)
def _hasHeaders(self, fileobj, dialect): def has_header(self, sample):
# Creates a dictionary of types of data in each column. If any # Creates a dictionary of types of data in each column. If any
# column is of a single type (say, integers), *except* for the first # column is of a single type (say, integers), *except* for the first
# row, then the first row is presumed to be labels. If the type # row, then the first row is presumed to be labels. If the type
...@@ -373,23 +365,16 @@ class Sniffer: ...@@ -373,23 +365,16 @@ class Sniffer:
""" """
return eval(item.replace('(', '').replace(')', '')) return eval(item.replace('(', '').replace(')', ''))
# rewind the fileobj - this might not work for some file-like rdr = reader(StringIO(sample), self.sniff(sample))
# objects...
fileobj.seek(0)
r = csv.reader(fileobj,
delimiter=dialect.delimiter,
quotechar=dialect.quotechar,
skipinitialspace=dialect.skipinitialspace)
header = r.next() # assume first row is header header = rdr.next() # assume first row is header
columns = len(header) columns = len(header)
columnTypes = {} columnTypes = {}
for i in range(columns): columnTypes[i] = None for i in range(columns): columnTypes[i] = None
checked = 0 checked = 0
for row in r: for row in rdr:
# arbitrary number of rows to check, to keep it sane # arbitrary number of rows to check, to keep it sane
if checked > 20: if checked > 20:
break break
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment