Python tutorial 29 -- whole-site crawler

Background knowledge
The core of whole-site crawling (this is what Baidu's spider does) is "following the vine to the melon": start from one page and follow its links outward until you have reached everything.

Tracking pages down by following links:
1. Pick a crawl entry point.
2. Set basic crawl rules:
   1. Decide what content under the domain name to fetch.
   2. De-duplicate: the same content must not be crawled twice (while keeping in mind that the site updates over time).
3. Build a basic recursion with a level depth, and decide up front when the recursion should shut off (a minimal sketch follows this list).
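
Here is a minimal sketch of the three rules working together, before the real crawler below. The fetch_links helper and the MAX_DEPTH value are assumptions made for this illustration, not part of the code that follows:

#coding=utf-8
import re
import urllib

# Hypothetical helper for this sketch: fetch a page and return the
# in-domain links on it (empty list on any error).
link_re = re.compile(r'href="(http://www\.cnpythoner\.com.*?)"')

def fetch_links(url):
    try:
        page = urllib.urlopen(url).read()
        return [m.group(1) for m in link_re.finditer(page)]
    except:
        return []

MAX_DEPTH = 3  # rule 3: a fixed level depth turns the recursion off

def crawl(url, seen, depth):
    if depth > MAX_DEPTH or url in seen:
        return
    seen.add(url)  # rule 2: never crawl the same URL twice
    for link in fetch_links(url):
        crawl(link, seen, depth + 1)

crawl("http://www.cnpythoner.com/", set(), 0)  # rule 1: the entry point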

Single-threaded version

#coding=utf-8
import urllib
import re

# Match every in-domain link on a page; the non-greedy .*? stops each
# match at the first closing quote, so each href is captured separately.
r = re.compile(r'href="(http://www\.cnpythoner\.com.*?)"')

# Download a page, save it to disk, and return an iterator over the
# in-domain links it contains.
def get_urls_and_save_from_contents(url):
    try:
        opener = urllib.urlopen(url)

        contents = opener.read()
        # finditer returns a lazy iterator: matches are only produced
        # as you loop over it.
        g = r.finditer(contents)
        opener.close()

        save_contents_from_url(url, contents)
        return g
    except:
        # Any network or read error: skip this page.
        return []

# Write the page contents to a file whose name is derived from the URL.
def save_contents_from_url(url, contents):
    filename = url.replace("http://", "")
    filename = filename.replace(".", "_")
    filename = filename.replace("/", "|")

    opene = open("/Users/jianan/Documents/cnpythoner_data/%s" % filename, "w")
    opene.write(contents)
    opene.close()
    return

# Recursively crawl: fetch one page, then recurse into every link on it
# that has not been seen before.
def get_and_save_url(url, data_cache, i):
    urls = get_urls_and_save_from_contents(url)
    for match in urls:
        found = match.groups()[0]
        if found in data_cache:
            continue  # never crawl the same URL twice
        data_cache.append(found)
        i += 1
        print i, found
        get_and_save_url(found, data_cache, i)

# Crawl entry point: data_cache is the list of URLs seen so far and
# i is just a running counter for the printout.
data_cache = []
i = 0
get_and_save_url("http://www.cnpythoner.com/", data_cache, i)
Multithreaded version

#coding=utf-8
import urllib
import re
import threading
import Queue

q = Queue.Queue()
mylock = threading.RLock()

# Same link pattern as before: every in-domain href on a page.
r = re.compile(r'href="(http://www\.cnpythoner\.com.*?)"')

# Shared list of URLs seen so far; guarded by mylock since many
# threads touch it.
urls = []

# Write the page contents to a file whose name is derived from the URL.
def save_contents_from_url(url, contents):
    filename = url.replace("http://", "")
    filename = filename.replace(".", "_")
    filename = filename.replace("/", "|")

    opene = open("/Users/jianan/Documents/cnpythoner_data/%s" % filename, "w")
    opene.write(contents)
    opene.close()
    return


# Extract links from a page and queue every one we have not seen yet.
def set_urls_from_contents(contents):
    g = r.finditer(contents)
    with mylock:  # the seen-list check and the append must be atomic
        for match in g:
            url = match.groups()[0]
            print url
            if url in urls:
                continue
            urls.append(url)
            q.put(url)

# Worker loop: pull a URL off the queue, fetch it, queue its links,
# and save its contents.
def save_contents():
    while True:
        try:
            # Exit the worker if no new URL shows up within 30 seconds;
            # a plain q.get() would block forever once the crawl is
            # done, and the join()s below would never return.
            url = q.get(timeout=30)
        except Queue.Empty:
            break
        try:
            opener = urllib.urlopen(url)
            contents = opener.read()
            opener.close()
            set_urls_from_contents(contents)
            save_contents_from_url(url, contents)
        except:
            # Any network error: drop this URL and move on.
            continue


# Seed the queue with the crawl entry point.
q.put("http://www.cnpythoner.com")

ts = []

for i in range(100):  # start 100 worker threads
    t = threading.Thread(target=save_contents)
    t.start()
    ts.append(t)

for t in ts:
    t.join()
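
The timeout-based exit in save_contents is one way to let the workers stop. The Queue module also supports the more conventional task_done()/join() shutdown pattern; a minimal sketch, where process(url) is a stand-in for the fetch/parse/save work shown above:

#coding=utf-8
import threading
import Queue

q = Queue.Queue()

def process(url):
    # Stand-in for the real fetch/parse/save work.
    print url

def worker():
    while True:
        url = q.get()
        try:
            process(url)
        finally:
            q.task_done()  # tell the queue this item is finished

for i in range(10):
    t = threading.Thread(target=worker)
    t.daemon = True  # daemon workers die when the main thread exits
    t.start()

q.put("http://www.cnpythoner.com")
q.join()  # returns once every put() has a matching task_done()

Because the workers are daemon threads, the program exits cleanly as soon as q.join() returns, with no explicit stop signal needed.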



