#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Extract Q&A records from base64-encoded snapshots of www.66law.cn pages.

Each stdin line is a base64 blob laid out as: newline-terminated header
lines (the URL first, plus "Content-Type: " / "Original-Size: " size
manifests formatted "type,size;type,size;..."), then a blank line, then the
concatenated -- possibly zlib-compressed -- page bodies.  The parsers pull
title / content / answer fields out of the HTML snapshot and print one
tab-separated record per page.

Modernized from the original Python-2 script (print statement,
reload(sys)/setdefaultencoding) and guarded against the IndexError crashes
noted inline.
"""
import base64
import re
import sys
import zlib

# Placeholder emitted for every field that cannot be extracted.
_NULL = 'null'

# URL patterns that decide which parser applies to a snapshot.
_ASPX_URL_RE = re.compile(r"https?://www\.66law\.cn/(\w+)/\d+\.aspx")
_ANSWER_URL_RE = re.compile(r"https?://www\.66law\.cn/question/(\w+)/\d+\.html")


def _clean(text):
    """Strip *text* and remove ALL inner whitespace; return 'null' if empty.

    Factors out the strip / re.sub(r'\\s','') / 'null' idiom the original
    repeated in every field extractor.
    """
    text = text.strip()
    return re.sub(r'\s', '', text) if text else _NULL


def splitoffpage(s, headers, contenttypes, contents):
    """Split a stored page blob into header lines and decoded bodies.

    Parameters
    ----------
    s : bytes (str is encoded first)
        Header lines, a blank line, then the concatenated bodies.
    headers, contenttypes, contents : list
        Output parameters, appended to in place: stripped header strings,
        content-type names, and decompressed body strings respectively.

    Returns True on success, False on malformed input.  (The original
    returned None on success; returning True is backward compatible.)
    """
    if isinstance(s, str):
        s = s.encode('utf-8', 'surrogateescape')
    pos = 0
    header_size = 0
    store_sizes = []
    original_sizes = []
    while True:
        nl = s.find(b'\n', pos)
        if nl < 0:
            return False  # header block never terminated by a blank line
        line = s[pos:nl].decode('utf-8', 'replace').strip()
        if not line:
            header_size = nl + 1  # bodies start right after the blank line
            break
        headers.append(line)
        if line.startswith('Content-Type: '):
            for item in line[len('Content-Type: '):].split(';'):
                kv = item.strip().split(',')
                if len(kv) == 2:
                    contenttypes.append(kv[0])
                    store_sizes.append(int(kv[1].strip()))
        if line.startswith('Original-Size: '):
            for i, item in enumerate(line[len('Original-Size: '):].split(';')):
                kv = item.strip().split(',')
                if len(kv) != 2:
                    break
                # Entries must align 1:1 with the Content-Type manifest.
                # Bounds check added: the original raised IndexError when
                # Original-Size appeared before Content-Type.
                if i >= len(contenttypes) or kv[0] != contenttypes[i]:
                    return False
                original_sizes.append(int(kv[1].strip()))
        pos = nl + 1
    offset = header_size
    for store_size, original_size in zip(store_sizes, original_sizes):
        body = s[offset:offset + store_size]
        if original_size > store_size:
            body = zlib.decompress(body)  # stored compressed
        contents.append(body.decode('utf-8', 'replace'))
        offset += store_size
    return True


def parse_66law_case(headers, contenttypes, contents):
    """Print one record for a /case/ page snapshot; no-op for other URLs.

    Output columns (tab separated): url, title, content, answer, img,
    answer_count, up_count, comment_count.
    """
    snapshot = contents[1]
    url = headers[0]
    m = _ASPX_URL_RE.search(url)
    if m is None or m.group(1) != "case":
        return
    tit = re.search(r'<div class="refer-box">.*?<h2>(.+?)</h2>', snapshot,
                    re.S | re.U)
    title = _clean(tit.group(1)) if tit else _NULL
    cont = re.search(r'<p class="det">(.+?)</p>', snapshot, re.S | re.U)
    content = _clean(cont.group(1)) if cont else _NULL
    replies = re.findall(r'<div class="refer-diag refer-diag-l">(.+?)</div>',
                         snapshot, re.S | re.U)
    answer_count = len(replies)
    answer = _NULL
    if replies:
        arrows = re.findall(r'<i class="arrow">(.+)', replies[0], re.S | re.U)
        # Guard added: the original indexed arrows[0] unconditionally and
        # raised IndexError when the first reply had no arrow marker.
        if arrows:
            para = re.search(r'<p>(.+?)</p>', arrows[0], re.U | re.S)
            if para is not None:
                answer = _clean(para.group(1))
    img = up_count = comment_count = _NULL
    print('\t'.join([url, title, content, answer, img, str(answer_count),
                     up_count, comment_count]))


def parse_66law_question(headers, contenttypes, contents):
    """Print one record for a /question/ page snapshot; no-op otherwise.

    The site serves two templates, hence two alternative patterns per field.
    Same output columns as parse_66law_case.
    """
    snapshot = contents[1]
    url = headers[0]
    m = _ASPX_URL_RE.search(url)
    if m is None or m.group(1) != "question":
        return
    tit = (re.search(r'<p class="f24 lh32">(.*?)</p>', snapshot, re.S)
           or re.search(r'<span class="f18">(.*?)</span>', snapshot, re.S))
    title = _clean(tit.group(1)) if tit else _NULL
    cont_1 = re.search(r'<p class="mt10 f18 lh32 s-c6">(.*?)</p>', snapshot, re.S)
    cont_2 = re.search(r'<p class="f14 lh24 s-c666">(.*?)</p>', snapshot, re.S)
    if cont_1 is not None:
        content = _clean(cont_1.group(1))
    elif cont_2 is not None:
        # Lazy import: bs4 is only needed for this second-template branch.
        from bs4 import BeautifulSoup
        content = _clean(str(BeautifulSoup(cont_2.group(1),
                                           "html.parser").get_text()))
    else:
        content = _NULL
    # NOTE(review): the original tried '<ul class="reply-list"(.*?)</ul>'
    # (no closing '>') first, which also matches the reply-list2 variant,
    # so its second alternative was effectively dead; the `or` preserves
    # that behavior.
    reply_block = (re.search(r'<ul class="reply-list"(.*?)</ul>', snapshot, re.S)
                   or re.search(r'<ul class="reply-list reply-list2">(.*?)</ul>',
                                snapshot, re.S))
    fallback = re.findall(r'<p class="f14 lh26">(.*?)</p>', snapshot, re.S)
    if reply_block is not None:
        answers = re.findall(r'<p class="b">(.*?)</p>', reply_block.group(1), re.S)
        answer_count = len(answers)
        answer = _clean(answers[0]) if answers else _NULL
    elif fallback:
        answer_count = len(fallback)
        answer = _clean(fallback[0])
    else:
        answer_count = 0
        answer = _NULL
    img = up_count = comment_count = _NULL
    print('\t'.join([url, title, content, answer, img, str(answer_count),
                     up_count, comment_count]))


def parse_66law_answer(headers, contenttypes, contents):
    """Print one record for a /question/answer/ page snapshot; no-op otherwise.

    Same output columns as parse_66law_case.
    """
    snapshot = contents[1]
    url = headers[0]
    m = _ANSWER_URL_RE.search(url)
    if m is None or m.group(1) != "answer":
        return
    tit = re.search(r'<span class="f18">(.+?)</span>', snapshot)
    title = _clean(tit.group(1)) if tit else _NULL
    cont = re.search(r'<p class="f14 lh24 s-c666">(.+?)</p>', snapshot)
    content = _clean(cont.group(1)) if cont else _NULL
    ans = re.search(r'<p class="f14 lh26">(.+?)</p>', snapshot)
    if ans is not None:
        answer = _clean(ans.group(1))
        # -1 kept from the original -- presumably it excludes the question's
        # own "cont-box" container from the answer count; confirm on-site.
        answer_count = len(re.findall(r'(<div class="cont-box")', snapshot)) - 1
    else:
        answer = _NULL
        answer_count = 0
    img = up_count = comment_count = _NULL
    print('\t'.join([url, title, content, answer, img, str(answer_count),
                     up_count, comment_count]))


def main():
    """Read one base64-encoded page blob per stdin line and print records."""
    for line in sys.stdin:
        headers, contenttypes, contents = [], [], []
        try:
            if not splitoffpage(base64.b64decode(line), headers,
                                contenttypes, contents):
                continue
        except Exception:
            # Best-effort stream processing: skip undecodable blobs.
            # (The original used a bare `except:` here.)
            continue
        if not headers or len(contents) < 2:
            continue  # parsers need headers[0] (URL) and contents[1] (HTML)
        # Each parser self-filters by URL, so dispatching to all three covers
        # every page type.  (The original only ever called the /case/ parser,
        # leaving the other two defined but unreachable.)
        parse_66law_case(headers, contenttypes, contents)
        parse_66law_question(headers, contenttypes, contents)
        parse_66law_answer(headers, contenttypes, contents)


if __name__ == "__main__":
    main()
# --- Provenance note --------------------------------------------------------
# This script was transcribed from a blog post titled "页面数据提取(2)"
# ("Page data extraction, part 2"):
#   blog.csdn.net/zhangye_2017/article/details/88055265
# The surrounding page chrome from the scrape ("猜你喜欢" / "今日推荐" /
# "周排行" -- recommendation-widget headings) is not part of the program.