Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
1448d471
Commit
1448d471
authored
Apr 25, 2003
by
Skip Montanaro
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
rework Sniffer api significantly
parent
48816c6f
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
26 additions
and
41 deletions
+26
-41
Lib/csv.py
Lib/csv.py
+26
-41
No files found.
Lib/csv.py
View file @
1448d471
...
@@ -9,6 +9,11 @@ from _csv import Error, __version__, writer, reader, register_dialect, \
...
@@ -9,6 +9,11 @@ from _csv import Error, __version__, writer, reader, register_dialect, \
QUOTE_MINIMAL
,
QUOTE_ALL
,
QUOTE_NONNUMERIC
,
QUOTE_NONE
,
\
QUOTE_MINIMAL
,
QUOTE_ALL
,
QUOTE_NONNUMERIC
,
QUOTE_NONE
,
\
__doc__
__doc__
try
:
from
cStringIO
import
StringIO
except
ImportError
:
from
StringIO
import
StringIO
__all__
=
[
"QUOTE_MINIMAL"
,
"QUOTE_ALL"
,
"QUOTE_NONNUMERIC"
,
"QUOTE_NONE"
,
__all__
=
[
"QUOTE_MINIMAL"
,
"QUOTE_ALL"
,
"QUOTE_NONNUMERIC"
,
"QUOTE_NONE"
,
"Error"
,
"Dialect"
,
"excel"
,
"excel_tab"
,
"reader"
,
"writer"
,
"Error"
,
"Dialect"
,
"excel"
,
"excel_tab"
,
"reader"
,
"writer"
,
"register_dialect"
,
"get_dialect"
,
"list_dialects"
,
"Sniffer"
,
"register_dialect"
,
"get_dialect"
,
"list_dialects"
,
"Sniffer"
,
...
@@ -147,52 +152,39 @@ class DictWriter:
...
@@ -147,52 +152,39 @@ class DictWriter:
class
Sniffer
:
class
Sniffer
:
'''
'''
"Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
"Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Returns a
csv.
Dialect object.
Returns a Dialect object.
'''
'''
def
__init__
(
self
,
sample
=
16
*
1024
):
def
__init__
(
self
):
# in case there is more than one possible delimiter
# in case there is more than one possible delimiter
self
.
preferred
=
[
','
,
'
\
t
'
,
';'
,
' '
,
':'
]
self
.
preferred
=
[
','
,
'
\
t
'
,
';'
,
' '
,
':'
]
# amount of data (in bytes) to sample
self
.
sample
=
sample
def
sniff
(
self
,
sample
):
def
sniff
(
self
,
fileobj
):
"""
"""
Takes a file-like object and returns a dialect (or None)
Returns a dialect (or None) corresponding to the sample
"""
"""
self
.
fileobj
=
fileobj
data
=
fileobj
.
read
(
self
.
sample
)
quotechar
,
delimiter
,
skipinitialspace
=
\
quotechar
,
delimiter
,
skipinitialspace
=
\
self
.
_guess
QuoteAndDelimiter
(
data
)
self
.
_guess
_quote_and_delimiter
(
sample
)
if
delimiter
is
None
:
if
delimiter
is
None
:
delimiter
,
skipinitialspace
=
self
.
_guess
Delimiter
(
data
)
delimiter
,
skipinitialspace
=
self
.
_guess
_delimiter
(
sample
)
class
SniffedD
ialect
(
Dialect
):
class
d
ialect
(
Dialect
):
_name
=
"sniffed"
_name
=
"sniffed"
lineterminator
=
'
\
r
\
n
'
lineterminator
=
'
\
r
\
n
'
quoting
=
QUOTE_MINIMAL
quoting
=
QUOTE_MINIMAL
# escapechar = ''
# escapechar = ''
doublequote
=
False
doublequote
=
False
SniffedDialect
.
delimiter
=
delimiter
SniffedDialect
.
quotechar
=
quotechar
SniffedDialect
.
skipinitialspace
=
skipinitialspace
self
.
dialect
=
SniffedDialect
dialect
.
delimiter
=
delimiter
return
self
.
dialect
# _csv.reader won't accept a quotechar of ''
dialect
.
quotechar
=
quotechar
or
'"'
dialect
.
skipinitialspace
=
skipinitialspace
return
dialect
def
hasHeaders
(
self
):
return
self
.
_hasHeaders
(
self
.
fileobj
,
self
.
dialect
)
def
_guess_quote_and_delimiter
(
self
,
data
):
def
register_dialect
(
self
,
name
=
'sniffed'
):
register_dialect
(
name
,
self
.
dialect
)
def
_guessQuoteAndDelimiter
(
self
,
data
):
"""
"""
Looks for text enclosed between two identical quotes
Looks for text enclosed between two identical quotes
(the probable quotechar) which are preceded and followed
(the probable quotechar) which are preceded and followed
...
@@ -256,7 +248,7 @@ class Sniffer:
...
@@ -256,7 +248,7 @@ class Sniffer:
return
(
quotechar
,
delim
,
skipinitialspace
)
return
(
quotechar
,
delim
,
skipinitialspace
)
def
_guess
D
elimiter
(
self
,
data
):
def
_guess
_d
elimiter
(
self
,
data
):
"""
"""
The delimiter /should/ occur the same number of times on
The delimiter /should/ occur the same number of times on
each row. However, due to malformed data, it may not. We don't want
each row. However, due to malformed data, it may not. We don't want
...
@@ -290,12 +282,12 @@ class Sniffer:
...
@@ -290,12 +282,12 @@ class Sniffer:
iteration
+=
1
iteration
+=
1
for
line
in
data
[
start
:
end
]:
for
line
in
data
[
start
:
end
]:
for
char
in
ascii
:
for
char
in
ascii
:
meta
f
requency
=
charFrequency
.
get
(
char
,
{})
meta
F
requency
=
charFrequency
.
get
(
char
,
{})
# must count even if frequency is 0
# must count even if frequency is 0
freq
=
line
.
strip
().
count
(
char
)
freq
=
line
.
strip
().
count
(
char
)
# value is the mode
# value is the mode
meta
frequency
[
freq
]
=
metaf
requency
.
get
(
freq
,
0
)
+
1
meta
Frequency
[
freq
]
=
metaF
requency
.
get
(
freq
,
0
)
+
1
charFrequency
[
char
]
=
meta
f
requency
charFrequency
[
char
]
=
meta
F
requency
for
char
in
charFrequency
.
keys
():
for
char
in
charFrequency
.
keys
():
items
=
charFrequency
[
char
].
items
()
items
=
charFrequency
[
char
].
items
()
...
@@ -356,7 +348,7 @@ class Sniffer:
...
@@ -356,7 +348,7 @@ class Sniffer:
return
(
delim
,
skipinitialspace
)
return
(
delim
,
skipinitialspace
)
def
_hasHeaders
(
self
,
fileobj
,
dialect
):
def
has_header
(
self
,
sample
):
# Creates a dictionary of types of data in each column. If any
# Creates a dictionary of types of data in each column. If any
# column is of a single type (say, integers), *except* for the first
# column is of a single type (say, integers), *except* for the first
# row, then the first row is presumed to be labels. If the type
# row, then the first row is presumed to be labels. If the type
...
@@ -373,23 +365,16 @@ class Sniffer:
...
@@ -373,23 +365,16 @@ class Sniffer:
"""
"""
return
eval
(
item
.
replace
(
'('
,
''
).
replace
(
')'
,
''
))
return
eval
(
item
.
replace
(
'('
,
''
).
replace
(
')'
,
''
))
# rewind the fileobj - this might not work for some file-like
rdr
=
reader
(
StringIO
(
sample
),
self
.
sniff
(
sample
))
# objects...
fileobj
.
seek
(
0
)
r
=
csv
.
reader
(
fileobj
,
delimiter
=
dialect
.
delimiter
,
quotechar
=
dialect
.
quotechar
,
skipinitialspace
=
dialect
.
skipinitialspace
)
header
=
r
.
next
()
# assume first row is header
header
=
r
dr
.
next
()
# assume first row is header
columns
=
len
(
header
)
columns
=
len
(
header
)
columnTypes
=
{}
columnTypes
=
{}
for
i
in
range
(
columns
):
columnTypes
[
i
]
=
None
for
i
in
range
(
columns
):
columnTypes
[
i
]
=
None
checked
=
0
checked
=
0
for
row
in
r
:
for
row
in
r
dr
:
# arbitrary number of rows to check, to keep it sane
# arbitrary number of rows to check, to keep it sane
if
checked
>
20
:
if
checked
>
20
:
break
break
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment