Commit 0a28bc52 authored by Benjamin Peterson's avatar Benjamin Peterson

remove old, outdated tool

parent 4fd283a4
Webchecker
----------
This is a simple web tree checker, useful to find bad links in a web
tree. It currently checks links pointing within the same subweb for
validity. The main program is "webchecker.py". See its doc string
(or invoke it with the option "-?") for more details.
History:
- Jan 1997. First release. The module robotparser.py was written by
Skip Montanaro; the rest is original work by Guido van Rossum.
- May 1999. Sam Bayer contributed a new version, wcnew.py, which
supports checking internal links (#spam fragments in URLs) and some
other options.
- Nov 1999. Sam Bayer contributed patches to reintegrate wcnew.py
into webchecker.py, and corresponding mods to wcgui.py and
websucker.py.
- Mar 2004. Chris Herborth contributed a patch to let webchecker.py
handle XHTML's 'id' attribute.
This diff is collapsed.
This diff is collapsed.
# Driver script: run webchecker over the python.org subweb with
# site-specific settings, then pause so a console window stays open
# long enough to read the report.
import webchecker, sys

# Override webchecker's module-level defaults before main() parses argv.
webchecker.DEFROOT = "http://www.python.org/python/"
webchecker.MAXPAGE = 50000
webchecker.verbose = 2
# NOTE(review): '-x' is a webchecker command-line flag; its meaning is
# defined in webchecker.py -- confirm against that module's usage text.
sys.argv.append('-x')
webchecker.main()
# Wait for the user to press Enter before exiting.
sys.stdout.write("\nCR to exit: ")
sys.stdout.flush()
sys.stdin.readline()
This diff is collapsed.
#! /usr/bin/env python3
"""A variant on webchecker that creates a mirror copy of a remote site."""
__version__ = "$Revision$"
import os
import sys
import getopt
import urllib.parse
import webchecker
# Reduce the RCS keyword to the bare revision number when it has been
# expanded to the form "$Revision: N.N $".
if __version__.startswith('$'):
    _fields = __version__.split()
    if len(_fields) == 3:
        __version__ = _fields[1]
def main():
    """Command-line entry point: parse -q/-v flags, then mirror each root URL.

    Returns 2 on a usage error; otherwise returns None after the crawl.
    """
    verbose = webchecker.VERBOSE
    try:
        options, roots = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error as msg:
        print(msg)
        print("usage:", sys.argv[0], "[-qv] ... [rooturl] ...")
        return 2
    for flag, _ in options:
        if flag == "-q":
            verbose = 0          # quiet: suppress progress output
        elif flag == "-v":
            verbose += 1         # each -v bumps verbosity one level
    sucker = Sucker()
    sucker.setflags(verbose=verbose)
    # Identify ourselves to servers with the tool name and version.
    sucker.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
    ]
    for root in roots:
        print("Adding root", root)
        sucker.addroot(root)
    print("Run...")
    sucker.run()
class Sucker(webchecker.Checker):

    """Checker subclass that saves a local mirror of every page it reads."""

    # Mirror rather than check: don't verify external links, and skip
    # name-anchor bookkeeping.
    checkext = 0
    nonames = 1

    # SAM 11/13/99: in general, URLs are now URL pairs.
    # Since we've suppressed name anchor checking,
    # we can ignore the second dimension.

    def readhtml(self, url_pair):
        """Return (text, url) for a page, saving a fresh copy to disk.

        If a previously saved copy exists on disk it is reused; otherwise
        the page is fetched, written via savefile(), and its (possibly
        redirected) URL is returned.  text is None for non-HTML content.
        """
        url = url_pair[0]
        text = None
        path = self.savefilename(url)
        try:
            f = open(path, "rb")
        except IOError:
            # No local copy yet: fetch the page over the network.
            f = self.openpage(url_pair)
            if f:
                info = f.info()
                nurl = f.geturl()
                if nurl != url:
                    # Follow the redirect for the on-disk location too.
                    url = nurl
                    path = self.savefilename(url)
                text = f.read()
                f.close()
                self.savefile(text, path)
                if not self.checkforhtml(info, url):
                    text = None
        else:
            # Local copy found; only return its text if it looks like HTML.
            if self.checkforhtml({}, url):
                text = f.read()
            f.close()
        return text, url

    def savefile(self, text, path):
        """Write *text* (bytes) to *path*, creating directories as needed.

        Failures are reported via message() rather than raised.
        """
        dir, base = os.path.split(path)
        makedirs(dir)
        try:
            f = open(path, "wb")
            f.write(text)
            f.close()
            self.message("saved %s", path)
        except IOError as msg:
            self.message("didn't save %s: %s", path, str(msg))

    def savefilename(self, url):
        """Map *url* to a local relative path of the form host/dir/file.

        Directory-like URLs (empty path or trailing '/') get index.html
        appended; '/' separators are converted to os.sep when they differ.
        """
        # NOTE(review): urllib.parse.splittype/splithost/splituser/splitnport
        # are legacy, deprecated helpers -- confirm availability on the
        # targeted Python version.
        type, rest = urllib.parse.splittype(url)
        host, path = urllib.parse.splithost(rest)
        path = path.lstrip("/")
        user, host = urllib.parse.splituser(host)
        host, port = urllib.parse.splitnport(host)
        host = host.lower()
        if not path or path[-1] == "/":
            path = path + "index.html"
        if os.sep != "/":
            path = os.sep.join(path.split("/"))
        path = os.path.join(host, path)
        return path
def makedirs(dir):
    """Recursively create *dir* and any missing parent directories.

    If *dir* exists but is a plain file, the file is moved aside and
    re-saved as index.html inside the newly created directory, so a page
    saved earlier under that name is preserved.
    """
    if not dir:
        # Ran out of path components; nothing left to create.
        return
    if not os.path.exists(dir):
        head, tail = os.path.split(dir)
        if not tail:
            print("Huh? Don't know how to make dir", dir)
            return
        # Create the parent chain first, then this directory.
        makedirs(head)
        os.mkdir(dir, 0o777)
        return
    if os.path.isdir(dir):
        # Already a directory -- done.
        return
    # A regular file occupies the directory name: tuck it away as the
    # directory's index.html.  Best-effort; OS errors are ignored.
    try:
        os.rename(dir, dir + ".bak")
        os.mkdir(dir)
        os.rename(dir + ".bak", os.path.join(dir, "index.html"))
    except os.error:
        pass
if __name__ == '__main__':
    # main() returns 2 on a usage error and None on success; map None to 0.
    sys.exit(main() or 0)
#! /usr/bin/env python3
"""Tkinter-based GUI for websucker.
Easy use: type or paste source URL and destination directory in
their respective text boxes, click GO or hit return, and presto.
"""
# "Tkinter" was renamed to "tkinter" in Python 3; the old capitalized
# name raises ImportError under the python3 shebang this file declares.
from tkinter import *
import websucker
import os
import threading
import queue
import time

# Default verbosity handed to the Sucker; 2 matches websucker's chatty mode.
VERBOSE = 2
# Exception raised by the worker thread to abort a crawl when the user
# presses Cancel.  The historical try/except wrapper that fell back to a
# string "exception" on pre-1.5 Pythons is dead code on Python 3 (a class
# statement here cannot raise NameError/TypeError, and string exceptions
# no longer exist), so a plain class definition suffices.
class Canceled(Exception):
    "Exception used to cancel run()."
class SuckerThread(websucker.Sucker):

    """websucker.Sucker that runs on a worker thread and reports via a queue."""

    stopit = 0      # set to 1 by the GUI to request cancellation
    savedir = None  # user-chosen destination directory, or None for default
    rootdir = None  # prefix of the default save path to strip under savedir

    def __init__(self, msgq):
        # msgq: queue.Queue the GUI polls; None is posted when a run ends.
        self.msgq = msgq
        websucker.Sucker.__init__(self)
        self.setflags(verbose=VERBOSE)
        # Identify ourselves to servers with the tool name and version.
        self.urlopener.addheaders = [
            ('User-agent', 'websucker/%s' % websucker.__version__),
        ]

    def message(self, format, *args):
        # Route progress messages to the GUI queue instead of stdout.
        if args:
            format = format%args
        self.msgq.put(format)

    def run1(self, url):
        """Crawl one root *url*; always post None so the GUI re-enables."""
        try:
            try:
                self.reset()
                self.addroot(url)
                self.run()
            except Canceled:
                self.message("[canceled]")
            else:
                self.message("[done]")
        finally:
            self.msgq.put(None)

    def savefile(self, text, path):
        # Honor a pending cancel request before each disk write.
        if self.stopit:
            raise Canceled
        websucker.Sucker.savefile(self, text, path)

    def getpage(self, url):
        # Honor a pending cancel request before each page fetch.
        if self.stopit:
            raise Canceled
        return websucker.Sucker.getpage(self, url)

    def savefilename(self, url):
        """Re-root the default host/path save layout under self.savedir."""
        path = websucker.Sucker.savefilename(self, url)
        if self.savedir:
            n = len(self.rootdir)
            if path[:n] == self.rootdir:
                path = path[n:]
            while path[:1] == os.sep:
                path = path[1:]
            path = os.path.join(self.savedir, path)
        return path

    # robots.txt handling deliberately disabled (rename XXX -> real names
    # to re-enable).
    def XXXaddrobot(self, *args):
        pass

    def XXXisallowed(self, *args):
        return 1
class App:

    """One crawl control panel: URL entry, directory entry, Go/Cancel/Paste+Go."""

    sucker = None  # SuckerThread instance, created lazily on first Go
    msgq = None    # queue.Queue carrying progress messages from the thread

    def __init__(self, top):
        """Build the widget grid inside parent frame *top*."""
        self.top = top
        # Let a far-right dummy column absorb horizontal stretch.
        top.columnconfigure(99, weight=1)
        self.url_label = Label(top, text="URL:")
        self.url_label.grid(row=0, column=0, sticky='e')
        self.url_entry = Entry(top, width=60, exportselection=0)
        self.url_entry.grid(row=0, column=1, sticky='we',
                            columnspan=99)
        self.url_entry.focus_set()
        # Return in the URL field starts the crawl, same as the Go button.
        self.url_entry.bind("<Key-Return>", self.go)
        self.dir_label = Label(top, text="Directory:")
        self.dir_label.grid(row=1, column=0, sticky='e')
        self.dir_entry = Entry(top)
        self.dir_entry.grid(row=1, column=1, sticky='we',
                            columnspan=99)
        self.go_button = Button(top, text="Go", command=self.go)
        self.go_button.grid(row=2, column=1, sticky='w')
        self.cancel_button = Button(top, text="Cancel",
                                    command=self.cancel,
                                    state=DISABLED)
        self.cancel_button.grid(row=2, column=2, sticky='w')
        self.auto_button = Button(top, text="Paste+Go",
                                  command=self.auto)
        self.auto_button.grid(row=2, column=3, sticky='w')
        self.status_label = Label(top, text="[idle]")
        self.status_label.grid(row=2, column=4, sticky='w')
        self.top.update_idletasks()
        self.top.grid_propagate(0)

    def message(self, text, *args):
        # Show a (possibly %-formatted) status message in the status label.
        if args:
            text = text % args
        self.status_label.config(text=text)

    def check_msgq(self):
        """Poll the worker queue; None marks end-of-run and resets buttons.

        Reschedules itself every 100 ms via Tk's after().
        """
        while not self.msgq.empty():
            msg = self.msgq.get()
            if msg is None:
                self.go_button.configure(state=NORMAL)
                self.auto_button.configure(state=NORMAL)
                self.cancel_button.configure(state=DISABLED)
                if self.sucker:
                    self.sucker.stopit = 0
                self.top.bell()
            else:
                self.message(msg)
        self.top.after(100, self.check_msgq)

    def go(self, event=None):
        """Start crawling the entered URL on a background thread."""
        if not self.msgq:
            # First run for this panel: create the queue and start polling.
            self.msgq = queue.Queue(0)
            self.check_msgq()
        if not self.sucker:
            self.sucker = SuckerThread(self.msgq)
        if self.sucker.stopit:
            # A cancel is still being processed; ignore this Go.
            return
        self.url_entry.selection_range(0, END)
        url = self.url_entry.get()
        url = url.strip()
        if not url:
            self.top.bell()
            self.message("[Error: No URL entered]")
            return
        self.rooturl = url
        dir = self.dir_entry.get().strip()
        if not dir:
            self.sucker.savedir = None
        else:
            self.sucker.savedir = dir
            # rootdir is the default-layout prefix savefilename() strips.
            self.sucker.rootdir = os.path.dirname(
                websucker.Sucker.savefilename(self.sucker, url))
        self.go_button.configure(state=DISABLED)
        self.auto_button.configure(state=DISABLED)
        self.cancel_button.configure(state=NORMAL)
        self.message( '[running...]')
        self.sucker.stopit = 0
        t = threading.Thread(target=self.sucker.run1, args=(url,))
        t.start()

    def cancel(self):
        # Ask the worker thread to stop at its next checkpoint.
        if self.sucker:
            self.sucker.stopit = 1
        self.message("[canceling...]")

    def auto(self):
        """Paste the X selection (or clipboard) into the URL field, then Go."""
        tries = ['PRIMARY', 'CLIPBOARD']
        text = ""
        for t in tries:
            try:
                text = self.top.selection_get(selection=t)
            except TclError:
                continue
            text = text.strip()
            if text:
                break
        if not text:
            self.top.bell()
            self.message("[Error: clipboard is empty]")
            return
        self.url_entry.delete(0, END)
        self.url_entry.insert(0, text)
        self.go()
class AppArray:

    """Top-level window holding a growable stack of App crawl panels."""

    def __init__(self, top=None):
        # Create (or adopt) the toplevel window and add the first panel.
        if not top:
            top = Tk()
            top.title("websucker GUI")
            top.iconname("wsgui")
            top.wm_protocol('WM_DELETE_WINDOW', self.exit)
        self.top = top
        self.appframe = Frame(self.top)
        self.appframe.pack(fill='both')
        self.applist = []
        self.exit_button = Button(top, text="Exit", command=self.exit)
        self.exit_button.pack(side=RIGHT)
        self.new_button = Button(top, text="New", command=self.addsucker)
        self.new_button.pack(side=LEFT)
        self.addsucker()

    def addsucker(self):
        """Append one more crawl panel to the window."""
        # Empty geometry string lets the window resize to its contents.
        self.top.geometry("")
        frame = Frame(self.appframe, borderwidth=2, relief=GROOVE)
        frame.pack(fill='x')
        app = App(frame)
        self.applist.append(app)

    done = 0  # set to 1 by exit() to break out of mainloop()

    def mainloop(self):
        # Hand-rolled event loop so exit() can terminate it cleanly.
        while not self.done:
            time.sleep(0.1)
            self.top.update()

    def exit(self):
        """Cancel all running crawls and stop the main loop."""
        for app in self.applist:
            app.cancel()
            app.message("[exiting...]")
        self.done = 1
def main():
    """Create the top-level window array and run its polling main loop."""
    AppArray().mainloop()

if __name__ == '__main__':
    main()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment