# -*- coding: utf-8 -*-
# A simple single-threaded crawler (Python 2): downloads pages under one
# domain, mirrors them to local files, and follows the links it finds.
import urlparse
from os import sep, unlink, makedirs, rmdir
from os.path import splitext, dirname, isdir, exists
import urllib
import urllib2
from htmllib import HTMLParser
from formatter import AbstractFormatter, DumbWriter
from cStringIO import StringIO
from string import replace, find, lower, index
from sys import argv
import shutil


class Retrieve(object):
    """Downloads a single URL to a local file that mirrors its path."""

    def __init__(self, url):
        self.url = url
        self.fileName = self.getFileName(url)
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1)'

    def getFileName(self, url, defaultName='index.html'):
        # Map the URL to a local path (host + path), appending index.html
        # when the URL has no file extension, then create the intermediate
        # directories one level at a time.
        parseurl = urlparse.urlparse(url, 'http', False)
        path = parseurl[1] + parseurl[2]
        ext = splitext(path)
        if ext[1] == '':
            if path[-1] == '/':
                path += defaultName
            else:
                path += '/' + defaultName
        ldir = dirname(path)
        if not isdir(ldir):
            if exists(ldir):
                unlink(ldir)
            totalDir = ''
            while True:
                try:
                    sepIndex = index(ldir, '/')
                    totalDir += ldir[0:sepIndex]
                    if not isdir(totalDir):
                        if exists(totalDir):
                            unlink(totalDir)
                        makedirs(totalDir)
                    totalDir += '/'
                    ldir = ldir[sepIndex + 1:]
                except ValueError:
                    # No '/' left: create the last directory level and stop.
                    totalDir += ldir
                    makedirs(totalDir)
                    break
        return path

    def download(self):
        # Fetch the URL with an explicit User-Agent header and write the
        # response to the local file; return '***' on failure.
        try:
            headers = {'User-Agent': self.user_agent}
            req = urllib2.Request(self.url, headers=headers)
            response = urllib2.urlopen(req)
            retval = response.readlines()
            f = open(self.fileName, 'wb')
            for line in retval:
                f.write(line)
            f.close()
        except IOError:
            retval = '***'
        return retval

    def parseAndGetLinks(self):
        # Parse the saved page and return the list of anchor targets.
        self.htmlParse = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.htmlParse.feed(open(self.fileName).read())
        self.htmlParse.close()
        return self.htmlParse.anchorlist


class Crawler(object):
    """Crawl starting from one URL, restricted to that URL's domain."""

    def __init__(self, url):
        self.url = url
        self.urlQueue = [url]      # URLs still to be downloaded
        self.urlSeenQueue = []     # URLs already processed
        self.domain = urlparse.urlparse(url)[1]
        if isdir(self.domain):
            # Start from a clean local mirror of the domain.
            shutil.rmtree(self.domain)

    def getPage(self, url):
        r = Retrieve(url)
        retVal = r.download()
        if retVal[0] == '*':
            # Download failed; skip this page.
            return
        urls = r.parseAndGetLinks()
        for urlOne in urls:
            # Turn relative links into absolute ones.
            if urlOne[:4] != 'http' and find(urlOne, '://') == -1:
                urlOne = urlparse.urljoin(url, urlOne)
            if find(lower(urlOne), 'mailto:') != -1:
                continue
            if urlOne not in self.urlSeenQueue:
                # Stay on the same domain, skip comment anchors, and only
                # follow links that belong to this blog (li2818).
                if find(urlOne, self.domain) == -1:
                    continue
                if find(urlOne, '#comments') != -1:
                    continue
                if find(urlOne, 'li2818') == -1:
                    continue
                if urlOne not in self.urlQueue and urlOne not in self.urlSeenQueue:
                    self.urlQueue.append(urlOne)
        self.urlSeenQueue.append(url)

    def testUseful(self, url):
        # Optional pre-check: only HTTP 200 responses are worth fetching.
        fUrl = urllib.urlopen(url)
        hCode = fUrl.getcode()
        if hCode != 200:
            return False
        return True

    def go(self):
        while self.urlQueue:
            url = self.urlQueue.pop()
            #if self.testUseful(url) == False:
            #    continue
            s = 'seen url: ' + url
            print s
            self.getPage(url)

    def printSeen(self):
        # Dump every processed URL to a file, one per line.
        f = open('already_seen_url', 'w')
        while self.urlSeenQueue:
            f.write(self.urlSeenQueue.pop() + '\n')
        f.close()


def main():
    #if len(argv) > 1:
    #    url = argv[1]
    #else:
    #    try:
    #        url = raw_input('start with one url: ')
    #    except (KeyboardInterrupt, EOFError):
    #        url = ''
    #if not url:
    #    return
    #crawler = Crawler(url)
    crawler = Crawler('http://blog.csdn.net/li2818')
    #crawler = Crawler('http://www.hao123.com')
    #crawler = Crawler('http://blog.csdn.net')
    crawler.go()
    crawler.printSeen()
    print 'done!'


if __name__ == '__main__':
    main()
A simple crawler program that sends a request header (User-Agent) with each download.
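The script above targets Python 2 only: urllib2, htmllib, formatter and cStringIO no longer exist in Python 3. As a rough, minimal sketch of the same idea on Python 3 (not part of the original post; the names LinkCollector and fetch_links are made up for illustration), fetching a page with an explicit User-Agent request header and collecting its anchors could look like this:

# Minimal Python 3 sketch; standard library only.
# LinkCollector and fetch_links are illustrative names, not from the post.
import urllib.request
from html.parser import HTMLParser
from urllib.parse import urljoin

USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1)'


class LinkCollector(HTMLParser):
    """Collects the href value of every <a> tag encountered."""

    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)


def fetch_links(url):
    # Send the request with an explicit User-Agent header, mirroring
    # Retrieve.download() above, then parse the response for links.
    req = urllib.request.Request(url, headers={'User-Agent': USER_AGENT})
    with urllib.request.urlopen(req) as response:
        html = response.read().decode('utf-8', errors='replace')
    collector = LinkCollector()
    collector.feed(html)
    # Resolve relative links against the page URL, as the crawler does.
    return [urljoin(url, link) for link in collector.links]


if __name__ == '__main__':
    for link in fetch_links('http://blog.csdn.net/li2818'):
        print(link)

The sketch keeps the original's User-Agent string and its link resolution (urljoin against the page URL) but deliberately leaves out the queueing, domain filtering and local mirroring that the full crawler performs.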
Reposted from blog.csdn.net/li2818/article/details/73135941