(2)抓取系统
common_urllib.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Small best-effort HTTP helpers for Python 2, built on urllib / urllib2.

Each helper logs failures through the module logger and returns an empty
unicode string instead of raising, so callers only need to check for an
empty result.
"""
import contextlib
import json
import logging
import os
import re
import sys
import traceback
import types
import urllib
import urllib2

import common_logging

# Root logger, as in the original module.
# NOTE(review): presumably common_logging installs the handlers on import --
# confirm before switching to logging.getLogger(__name__).
logger = logging.getLogger()


def _to_unicode(raw):
    """Strip surrounding whitespace and decode *raw* as UTF-8, dropping bad bytes."""
    return unicode(raw.strip(), 'utf-8', 'ignore')


def get(url):
    """Fetch *url* with HTTP GET and return the response body as unicode.

    Args:
        url: Address to fetch.

    Returns:
        The stripped, UTF-8 decoded body when the status code is 200;
        otherwise ``u''`` (non-200 status or any error, both logged).
    """
    content = u''
    try:
        # closing() guarantees the underlying socket is released even when
        # reading or decoding fails.
        with contextlib.closing(urllib.urlopen(url)) as response:
            if response.getcode() == 200:
                content = _to_unicode(response.read())
            else:
                logger.error('fetch error [%s]', url)
    except Exception:
        # Best-effort API: log the traceback and fall through to return u''.
        # Exception (not a bare except) lets Ctrl-C / SystemExit propagate.
        logger.error('fetch error %s', traceback.format_exc())
    return content


def post(url, dict_data=None):
    """Send *dict_data* as a form-encoded HTTP POST and return the body as unicode.

    Args:
        url: Address to post to.
        dict_data: Mapping (or sequence of pairs) accepted by
            ``urllib.urlencode``. Defaults to an empty form, which still
            results in a POST with an empty body.

    Returns:
        The stripped, UTF-8 decoded body when the status code is 200;
        otherwise ``u''`` (non-200 status or any error, both logged).
    """
    if dict_data is None:
        # Avoid a shared mutable default; an empty form encodes to ''.
        dict_data = {}
    content = u''
    try:
        request = urllib2.Request(url, urllib.urlencode(dict_data))
        with contextlib.closing(urllib2.urlopen(request)) as response:
            if response.getcode() == 200:
                content = _to_unicode(response.read())
            else:
                logger.error('fetch error [%s]', url)
    except Exception:
        logger.error('%s', traceback.format_exc())
    return content


def post_content(url, data):
    """Send the already-encoded *data* as an HTTP POST body and return the raw reply.

    Unlike ``post``, the payload is sent as given and the response body is
    returned exactly as read (no stripping, no decoding).

    Args:
        url: Address to post to.
        data: Request body passed straight to ``urllib2.Request``.

    Returns:
        The raw response body when the status code is 200; otherwise ``u''``
        (non-200 status or any error, both logged).
    """
    content = u''
    try:
        request = urllib2.Request(url, data)
        with contextlib.closing(urllib2.urlopen(request)) as response:
            if response.getcode() == 200:
                content = response.read()
            else:
                logger.error('fetch error [%s]', url)
    except Exception:
        logger.error('%s', traceback.format_exc())
    return content


if __name__ == '__main__':
    # Manual smoke test: fetch a page and report how many characters came back.
    content = get("http://www.sina.com")
    # Parenthesised single argument prints identically under the Python 2
    # print statement.
    print(len(content))