#!/usr/bin/python
#
"""Web Crawler.

This library gets HTML from the web and extracts URLs from the HTML.

usage (as library):
    from crawler import crawl
    urls = crawl("http://example.com/")

usage (as command):
    crawler url1 url2 ... > list
"""
#
# Copyright (c) 2005 Satoshi Fukutomi.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#

import re
import urllib
from socket import getdefaulttimeout, setdefaulttimeout
from urlparse import urljoin, urldefrag

__all__ = ["crawl"]

agent = None


class Agent(urllib.FancyURLopener):
    """URL opener that reports the user agent set up in _initialize()."""
    pass


def _initialize():
    """Set a default socket timeout and build the shared URL opener."""
    if getdefaulttimeout() is None:
        setdefaulttimeout(20)
    # Temporarily swap in a browser-like user agent while the opener is
    # created, then restore urllib's original version string.
    agent_version = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"
    ua = urllib.URLopener.version
    urllib.URLopener.version = agent_version
    global agent
    agent = Agent()
    urllib.URLopener.version = ua

_initialize()


def crawl(url):
    """Fetch url and return a sorted list of the unique URLs found in it."""
    urls = []
    haslink = re.compile(r"(href|src)=[\"']?([^\"'<> ]+)", re.I)
    f = agent.open(url)
    for line in f:
        # Scan each line repeatedly, consuming one href/src link per pass.
        while True:
            found = haslink.search(line)
            if found:
                u = found.group(2)
                u = u.replace("&amp;", "&")   # undo HTML entity escaping
                u = urldefrag(u)[0]           # drop any "#fragment"
                u = urljoin(url, u)           # resolve relative links
                urls.append(u)
                line = line[found.end():]
            else:
                break
    # Sort and remove duplicates.
    urls.sort()
    buf = []
    prev = ""
    for i in urls:
        if prev != i:
            buf.append(i)
            prev = i
    return buf


def main():
    import sys
    for i in sys.argv[1:]:
        if i.find("://") < 0:
            i = "http://" + i
        for j in crawl(i):
            print j


if __name__ == "__main__":
    main()