# -*- coding: utf-8 -*-
from urllib import request
import re
def re_geturl(mytext, index):
    """Print and return the unique URLs in ``mytext`` ending in ``D<index>``.

    A URL is a run that starts at ``http`` and contains no whitespace,
    quotes or angle brackets, so one match can never stretch across
    several links that share a line of HTML. Every ``amp;`` is removed
    from each match, which turns an HTML-escaped ``&amp;`` into ``&``.

    Args:
        mytext: Text (typically a page's HTML) to search.
        index: Value that must follow the literal ``D`` at the end of a
            match; it is converted with ``str`` and matched literally.

    Returns:
        The de-duplicated URLs in first-seen order. Each one is also
        printed on its own line.
    """
    pattern = re.compile(r"""http[^\s"'<>]*D""" + re.escape(str(index)))
    cleaned = (match.replace("amp;", "") for match in pattern.findall(mytext))
    # dict.fromkeys drops duplicates while keeping first-seen order; a set
    # would make the print order vary from run to run.
    http_list = list(dict.fromkeys(cleaned))
    for url in http_list:
        print(url)
    return http_list
def get_prestige_hd(max_pages=1):
    """Download forum listing pages and pass each one to ``re_geturl``.

    For every page number from 1 to ``max_pages`` the page URL is printed,
    the page is fetched and decoded as UTF-8, and the resulting HTML is
    handed to ``re_geturl`` together with the page number.

    Args:
        max_pages: Number of listing pages to fetch, starting at page 1.
            The default of 1 keeps the previous behaviour, where the loop
            stopped unconditionally after the first page.

    Returns:
        Always an empty string; the URLs that are found are reported by
        ``re_geturl``, not returned from here.
    """
    url = "http://fxxxx.com/forum.php?mod=forumdisplay&fid=131&page="
    for i in range(1, max_pages + 1):
        cur_url = url + str(i)
        print(cur_url)
        # The context manager closes the connection even when reading or
        # decoding raises.
        with request.urlopen(cur_url) as response:
            html = response.read().decode("utf-8")
        re_geturl(html, i)
    return ""
# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    get_prestige_hd()
# 爬虫入门(1)
# 猜你喜欢
# 转载自blog.csdn.net/linchaoa1989/article/details/82956864
# 今日推荐
# 周排行