Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
125700ad
Commit
125700ad
authored
Jul 08, 1998
by
Guido van Rossum
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Instead of printing, use self.message() or self.note().
parent
0fd9408c
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
63 additions
and
72 deletions
+63
-72
Tools/webchecker/webchecker.py
Tools/webchecker/webchecker.py
+62
-71
Tools/webchecker/websucker.py
Tools/webchecker/websucker.py
+1
-1
No files found.
Tools/webchecker/webchecker.py
View file @
125700ad
...
@@ -249,6 +249,17 @@ class Checker:
...
@@ -249,6 +249,17 @@ class Checker:
self
.
errors
=
{}
self
.
errors
=
{}
self
.
urlopener
=
MyURLopener
()
self
.
urlopener
=
MyURLopener
()
self
.
changed
=
0
self
.
changed
=
0
def
note
(
self
,
level
,
format
,
*
args
):
if
self
.
verbose
>
level
:
if
args
:
format
=
format
%
args
self
.
message
(
format
)
def
message
(
self
,
format
,
*
args
):
if
args
:
format
=
format
%
args
print
format
def
__getstate__
(
self
):
def
__getstate__
(
self
):
return
(
self
.
roots
,
self
.
todo
,
self
.
done
,
self
.
bad
,
self
.
round
)
return
(
self
.
roots
,
self
.
todo
,
self
.
done
,
self
.
bad
,
self
.
round
)
...
@@ -280,23 +291,18 @@ class Checker:
...
@@ -280,23 +291,18 @@ class Checker:
if
self
.
robots
.
has_key
(
root
):
return
if
self
.
robots
.
has_key
(
root
):
return
url
=
urlparse
.
urljoin
(
root
,
"/robots.txt"
)
url
=
urlparse
.
urljoin
(
root
,
"/robots.txt"
)
self
.
robots
[
root
]
=
rp
=
robotparser
.
RobotFileParser
()
self
.
robots
[
root
]
=
rp
=
robotparser
.
RobotFileParser
()
if
self
.
verbose
>
2
:
self
.
note
(
2
,
"Parsing %s"
,
url
)
print
"Parsing"
,
url
rp
.
debug
=
self
.
verbose
>
3
rp
.
debug
=
self
.
verbose
>
3
rp
.
set_url
(
url
)
rp
.
set_url
(
url
)
try
:
try
:
rp
.
read
()
rp
.
read
()
except
IOError
,
msg
:
except
IOError
,
msg
:
if
self
.
verbose
>
1
:
self
.
note
(
1
,
"I/O error parsing %s: %s"
,
url
,
msg
)
print
"I/O error parsing"
,
url
,
":"
,
msg
def
run
(
self
):
def
run
(
self
):
while
self
.
todo
:
while
self
.
todo
:
self
.
round
=
self
.
round
+
1
self
.
round
=
self
.
round
+
1
if
self
.
verbose
>
0
:
self
.
note
(
0
,
"
\
n
Round %d (%s)
\
n
"
,
self
.
round
,
self
.
status
())
print
print
"Round %d (%s)"
%
(
self
.
round
,
self
.
status
())
print
urls
=
self
.
todo
.
keys
()
urls
=
self
.
todo
.
keys
()
urls
.
sort
()
urls
.
sort
()
del
urls
[
self
.
roundsize
:]
del
urls
[
self
.
roundsize
:]
...
@@ -310,40 +316,37 @@ class Checker:
...
@@ -310,40 +316,37 @@ class Checker:
len
(
self
.
bad
))
len
(
self
.
bad
))
def
report
(
self
):
def
report
(
self
):
print
self
.
message
(
""
)
if
not
self
.
todo
:
print
"Final"
,
if
not
self
.
todo
:
s
=
"Final"
else
:
print
"Interim"
,
else
:
s
=
"Interim"
print
"Report (%s)"
%
self
.
status
(
)
self
.
message
(
"%s Report (%s)"
,
s
,
self
.
status
()
)
self
.
report_errors
()
self
.
report_errors
()
def
report_errors
(
self
):
def
report_errors
(
self
):
if
not
self
.
bad
:
if
not
self
.
bad
:
print
self
.
message
(
"
\
n
No errors"
)
print
"No errors"
return
return
print
self
.
message
(
"
\
n
Error Report:"
)
print
"Error Report:"
sources
=
self
.
errors
.
keys
()
sources
=
self
.
errors
.
keys
()
sources
.
sort
()
sources
.
sort
()
for
source
in
sources
:
for
source
in
sources
:
triples
=
self
.
errors
[
source
]
triples
=
self
.
errors
[
source
]
print
self
.
message
(
""
)
if
len
(
triples
)
>
1
:
if
len
(
triples
)
>
1
:
print
len
(
triples
),
"Errors in"
,
source
self
.
message
(
"%d Errors in %s"
,
len
(
triples
),
source
)
else
:
else
:
print
"Error in"
,
source
self
.
message
(
"Error in %s"
,
source
)
for
url
,
rawlink
,
msg
in
triples
:
for
url
,
rawlink
,
msg
in
triples
:
print
" HREF"
,
url
,
if
rawlink
!=
url
:
s
=
" (%s)"
%
rawlink
if
rawlink
!=
url
:
print
"(%s)"
%
rawlink
,
else
:
s
=
""
print
self
.
message
(
" HREF %s%s
\
n
msg %s"
,
url
,
s
,
msg
)
print
" msg"
,
msg
def
dopage
(
self
,
url
):
def
dopage
(
self
,
url
):
if
self
.
verbose
>
1
:
if
self
.
verbose
>
1
:
if
self
.
verbose
>
2
:
if
self
.
verbose
>
2
:
self
.
show
(
"Check "
,
url
,
" from"
,
self
.
todo
[
url
])
self
.
show
(
"Check "
,
url
,
" from"
,
self
.
todo
[
url
])
else
:
else
:
print
"Check "
,
url
self
.
message
(
"Check %s"
,
url
)
page
=
self
.
getpage
(
url
)
page
=
self
.
getpage
(
url
)
if
page
:
if
page
:
for
info
in
page
.
getlinkinfos
():
for
info
in
page
.
getlinkinfos
():
...
@@ -360,18 +363,15 @@ class Checker:
...
@@ -360,18 +363,15 @@ class Checker:
def
newdonelink
(
self
,
url
,
origin
):
def
newdonelink
(
self
,
url
,
origin
):
self
.
done
[
url
].
append
(
origin
)
self
.
done
[
url
].
append
(
origin
)
if
self
.
verbose
>
3
:
self
.
note
(
3
,
" Done link %s"
,
url
)
print
" Done link"
,
url
def
newtodolink
(
self
,
url
,
origin
):
def
newtodolink
(
self
,
url
,
origin
):
if
self
.
todo
.
has_key
(
url
):
if
self
.
todo
.
has_key
(
url
):
self
.
todo
[
url
].
append
(
origin
)
self
.
todo
[
url
].
append
(
origin
)
if
self
.
verbose
>
3
:
self
.
note
(
3
,
" Seen todo link %s"
,
url
)
print
" Seen todo link"
,
url
else
:
else
:
self
.
todo
[
url
]
=
[
origin
]
self
.
todo
[
url
]
=
[
origin
]
if
self
.
verbose
>
3
:
self
.
note
(
3
,
" New todo link %s"
,
url
)
print
" New todo link"
,
url
def
markdone
(
self
,
url
):
def
markdone
(
self
,
url
):
self
.
done
[
url
]
=
self
.
todo
[
url
]
self
.
done
[
url
]
=
self
.
todo
[
url
]
...
@@ -381,18 +381,21 @@ class Checker:
...
@@ -381,18 +381,21 @@ class Checker:
def
inroots
(
self
,
url
):
def
inroots
(
self
,
url
):
for
root
in
self
.
roots
:
for
root
in
self
.
roots
:
if
url
[:
len
(
root
)]
==
root
:
if
url
[:
len
(
root
)]
==
root
:
root
=
urlparse
.
urljoin
(
root
,
"/"
)
return
self
.
isallowed
(
root
,
url
)
return
self
.
robots
[
root
].
can_fetch
(
AGENTNAME
,
url
)
return
0
return
0
def
isallowed
(
self
,
root
,
url
):
root
=
urlparse
.
urljoin
(
root
,
"/"
)
return
self
.
robots
[
root
].
can_fetch
(
AGENTNAME
,
url
)
def
getpage
(
self
,
url
):
def
getpage
(
self
,
url
):
if
url
[:
7
]
==
'mailto:'
or
url
[:
5
]
==
'news:'
:
if
url
[:
7
]
==
'mailto:'
or
url
[:
5
]
==
'news:'
:
if
self
.
verbose
>
1
:
print
" Not checking mailto/news URL"
self
.
note
(
1
,
" Not checking mailto/news URL"
)
return
None
return
None
isint
=
self
.
inroots
(
url
)
isint
=
self
.
inroots
(
url
)
if
not
isint
:
if
not
isint
:
if
not
self
.
checkext
:
if
not
self
.
checkext
:
if
self
.
verbose
>
1
:
print
" Not checking ext link"
self
.
note
(
1
,
" Not checking ext link"
)
return
None
return
None
f
=
self
.
openpage
(
url
)
f
=
self
.
openpage
(
url
)
if
f
:
if
f
:
...
@@ -400,11 +403,10 @@ class Checker:
...
@@ -400,11 +403,10 @@ class Checker:
return
None
return
None
text
,
nurl
=
self
.
readhtml
(
url
)
text
,
nurl
=
self
.
readhtml
(
url
)
if
nurl
!=
url
:
if
nurl
!=
url
:
if
self
.
verbose
>
1
:
self
.
note
(
1
,
" Redirected to %s"
,
nurl
)
print
" Redirected to"
,
nurl
url
=
nurl
url
=
nurl
if
text
:
if
text
:
return
Page
(
text
,
url
,
verbose
=
self
.
verbose
,
maxpage
=
self
.
maxpage
)
return
Page
(
text
,
url
,
maxpage
=
self
.
maxpage
,
checker
=
self
)
def
readhtml
(
self
,
url
):
def
readhtml
(
self
,
url
):
text
=
None
text
=
None
...
@@ -429,8 +431,7 @@ class Checker:
...
@@ -429,8 +431,7 @@ class Checker:
return
self
.
urlopener
.
open
(
url
)
return
self
.
urlopener
.
open
(
url
)
except
IOError
,
msg
:
except
IOError
,
msg
:
msg
=
self
.
sanitize
(
msg
)
msg
=
self
.
sanitize
(
msg
)
if
self
.
verbose
>
0
:
self
.
note
(
0
,
"Error %s"
,
msg
)
print
"Error "
,
msg
if
self
.
verbose
>
0
:
if
self
.
verbose
>
0
:
self
.
show
(
" HREF "
,
url
,
" from"
,
self
.
todo
[
url
])
self
.
show
(
" HREF "
,
url
,
" from"
,
self
.
todo
[
url
])
self
.
setbad
(
url
,
msg
)
self
.
setbad
(
url
,
msg
)
...
@@ -446,21 +447,18 @@ class Checker:
...
@@ -446,21 +447,18 @@ class Checker:
if
ctype
==
'text/html'
:
if
ctype
==
'text/html'
:
return
1
return
1
else
:
else
:
if
self
.
verbose
>
1
:
self
.
note
(
1
,
" Not HTML, mime type %s"
,
ctype
)
print
" Not HTML, mime type"
,
ctype
return
0
return
0
def
setgood
(
self
,
url
):
def
setgood
(
self
,
url
):
if
self
.
bad
.
has_key
(
url
):
if
self
.
bad
.
has_key
(
url
):
del
self
.
bad
[
url
]
del
self
.
bad
[
url
]
self
.
changed
=
1
self
.
changed
=
1
if
self
.
verbose
>
0
:
self
.
note
(
0
,
"(Clear previously seen error)"
)
print
"(Clear previously seen error)"
def
setbad
(
self
,
url
,
msg
):
def
setbad
(
self
,
url
,
msg
):
if
self
.
bad
.
has_key
(
url
)
and
self
.
bad
[
url
]
==
msg
:
if
self
.
bad
.
has_key
(
url
)
and
self
.
bad
[
url
]
==
msg
:
if
self
.
verbose
>
0
:
self
.
note
(
0
,
"(Seen this error before)"
)
print
"(Seen this error before)"
return
return
self
.
bad
[
url
]
=
msg
self
.
bad
[
url
]
=
msg
self
.
changed
=
1
self
.
changed
=
1
...
@@ -485,15 +483,15 @@ class Checker:
...
@@ -485,15 +483,15 @@ class Checker:
# changed into methods so they can be overridden in subclasses.
# changed into methods so they can be overridden in subclasses.
def
show
(
self
,
p1
,
link
,
p2
,
origins
):
def
show
(
self
,
p1
,
link
,
p2
,
origins
):
print
p1
,
link
self
.
message
(
"%s %s"
,
p1
,
link
)
i
=
0
i
=
0
for
source
,
rawlink
in
origins
:
for
source
,
rawlink
in
origins
:
i
=
i
+
1
i
=
i
+
1
if
i
==
2
:
if
i
==
2
:
p2
=
' '
*
len
(
p2
)
p2
=
' '
*
len
(
p2
)
print
p2
,
source
,
if
rawlink
!=
link
:
s
=
" (%s)"
%
rawlink
if
rawlink
!=
link
:
print
"(%s)"
%
rawlink
,
else
:
s
=
""
print
self
.
message
(
"%s %s%s"
,
p2
,
source
,
s
)
def
sanitize
(
self
,
msg
):
def
sanitize
(
self
,
msg
):
if
isinstance
(
IOError
,
ClassType
)
and
isinstance
(
msg
,
IOError
):
if
isinstance
(
IOError
,
ClassType
)
and
isinstance
(
msg
,
IOError
):
...
@@ -521,16 +519,11 @@ class Checker:
...
@@ -521,16 +519,11 @@ class Checker:
def
save_pickle
(
self
,
dumpfile
=
DUMPFILE
):
def
save_pickle
(
self
,
dumpfile
=
DUMPFILE
):
if
not
self
.
changed
:
if
not
self
.
changed
:
if
self
.
verbose
>
0
:
self
.
note
(
0
,
"
\
n
No need to save checkpoint"
)
print
print
"No need to save checkpoint"
elif
not
dumpfile
:
elif
not
dumpfile
:
if
self
.
verbose
>
0
:
self
.
note
(
0
,
"No dumpfile, won't save checkpoint"
)
print
"No dumpfile, won't save checkpoint"
else
:
else
:
if
self
.
verbose
>
0
:
self
.
note
(
0
,
"
\
n
Saving checkpoint to %s ..."
,
dumpfile
)
print
print
"Saving checkpoint to %s ..."
%
dumpfile
newfile
=
dumpfile
+
".new"
newfile
=
dumpfile
+
".new"
f
=
open
(
newfile
,
"wb"
)
f
=
open
(
newfile
,
"wb"
)
pickle
.
dump
(
self
,
f
)
pickle
.
dump
(
self
,
f
)
...
@@ -540,29 +533,26 @@ class Checker:
...
@@ -540,29 +533,26 @@ class Checker:
except
os
.
error
:
except
os
.
error
:
pass
pass
os
.
rename
(
newfile
,
dumpfile
)
os
.
rename
(
newfile
,
dumpfile
)
if
self
.
verbose
>
0
:
self
.
note
(
0
,
"Done."
)
print
"Done."
return
1
return
1
class
Page
:
class
Page
:
def
__init__
(
self
,
text
,
url
,
verbose
=
VERBOSE
,
maxpage
=
MAXPAGE
):
def
__init__
(
self
,
text
,
url
,
verbose
=
VERBOSE
,
maxpage
=
MAXPAGE
,
checker
=
None
):
self
.
text
=
text
self
.
text
=
text
self
.
url
=
url
self
.
url
=
url
self
.
verbose
=
verbose
self
.
verbose
=
verbose
self
.
maxpage
=
maxpage
self
.
maxpage
=
maxpage
self
.
checker
=
checker
def
getlinkinfos
(
self
):
def
getlinkinfos
(
self
):
size
=
len
(
self
.
text
)
size
=
len
(
self
.
text
)
if
size
>
self
.
maxpage
:
if
size
>
self
.
maxpage
:
if
self
.
verbose
>
0
:
self
.
note
(
0
,
"Skip huge file %s (%.0f Kbytes)"
,
self
.
url
,
(
size
*
0.001
))
print
"Skip huge file"
,
self
.
url
print
" (%.0f Kbytes)"
%
(
size
*
0.001
)
return
[]
return
[]
if
self
.
verbose
>
2
:
self
.
checker
.
note
(
2
,
" Parsing %s (%d bytes)"
,
self
.
url
,
size
)
print
" Parsing"
,
self
.
url
,
"(%d bytes)"
%
size
parser
=
MyHTMLParser
(
verbose
=
self
.
verbose
,
checker
=
self
.
checker
)
parser
=
MyHTMLParser
(
verbose
=
self
.
verbose
)
parser
.
feed
(
self
.
text
)
parser
.
feed
(
self
.
text
)
parser
.
close
()
parser
.
close
()
rawlinks
=
parser
.
getlinks
()
rawlinks
=
parser
.
getlinks
()
...
@@ -631,10 +621,11 @@ class MyURLopener(urllib.FancyURLopener):
...
@@ -631,10 +621,11 @@ class MyURLopener(urllib.FancyURLopener):
class
MyHTMLParser
(
sgmllib
.
SGMLParser
):
class
MyHTMLParser
(
sgmllib
.
SGMLParser
):
def
__init__
(
self
,
verbose
=
VERBOSE
):
def
__init__
(
self
,
verbose
=
VERBOSE
,
checker
=
None
):
self
.
myverbose
=
verbose
# now unused
self
.
checker
=
checker
self
.
base
=
None
self
.
base
=
None
self
.
links
=
{}
self
.
links
=
{}
self
.
myverbose
=
verbose
sgmllib
.
SGMLParser
.
__init__
(
self
)
sgmllib
.
SGMLParser
.
__init__
(
self
)
def
start_a
(
self
,
attributes
):
def
start_a
(
self
,
attributes
):
...
@@ -662,8 +653,8 @@ class MyHTMLParser(sgmllib.SGMLParser):
...
@@ -662,8 +653,8 @@ class MyHTMLParser(sgmllib.SGMLParser):
if
name
==
'href'
:
if
name
==
'href'
:
if
value
:
value
=
string
.
strip
(
value
)
if
value
:
value
=
string
.
strip
(
value
)
if
value
:
if
value
:
if
self
.
myverbose
>
1
:
if
self
.
checker
:
print
" Base"
,
value
self
.
checker
.
note
(
1
,
" Base %s"
,
value
)
self
.
base
=
value
self
.
base
=
value
def
getlinks
(
self
):
def
getlinks
(
self
):
...
...
Tools/webchecker/websucker.py
View file @
125700ad
...
@@ -76,7 +76,7 @@ class Sucker(webchecker.Checker):
...
@@ -76,7 +76,7 @@ class Sucker(webchecker.Checker):
f
=
open
(
path
,
"wb"
)
f
=
open
(
path
,
"wb"
)
f
.
write
(
text
)
f
.
write
(
text
)
f
.
close
()
f
.
close
()
print
"saved"
,
path
self
.
message
(
"saved %s"
,
path
)
def
savefilename
(
self
,
url
):
def
savefilename
(
self
,
url
):
type
,
rest
=
urllib
.
splittype
(
url
)
type
,
rest
=
urllib
.
splittype
(
url
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment