re 正则表达式爬取网站标题

import requests
import re

# Fetch a page and extract the text of its <title> element with a regex.
url = 'http://www.jd.com/'
# Alternative target: url = 'http://www.eastmoney.com/'
# requests applies no timeout by default, so without one an unresponsive
# server would make this script hang forever.
r = requests.get(url, timeout=10)
# Force UTF-8 decoding of the response body before r.text is read.
r.encoding = 'utf-8'
# re.S lets "." match newlines, so a <title> that spans several lines is
# still captured; "(.*?)" is non-greedy and stops at the first </title>.
data = re.findall(r'<title>(.*?)</title>', r.text, re.S)
print(data)

['京东(JD.COM)-正品低价、品质保障、配送及时、轻松购物！']

['东方财富网：财经门户，提供专业的财经、股票、行情、证券、基金、理财、银行、保险、信托、期货、黄金、股吧、博客等各类财经资讯及数据']

 
        import re

        # Extract the literal substring "python".
        key = "javapythonc++php"
        # A pattern with no metacharacters matches itself; [0] picks the first hit.
        re.findall("python", key)[0]
        # Result: 'python'
       
        # Extract "hello world" together with its surrounding <h1> tags.
        key = "<html><h1>hello world</h1></html>"
        # The pattern is a plain literal, so findall returns the exact matched text.
        re.findall('<h1>hello world</h1>', key)
        # Result: ['<h1>hello world</h1>']
       
        # Extract the number 170.
        string = "我喜欢身高为170的女孩"
        # Literal alternative: re.findall("170", string)[0]
        # r'\d+' matches one or more consecutive digits; the raw string keeps the
        # backslash from being treated as a (invalid) string escape.
        re.findall(r'\d+', string)
        # Result: ['170']
       
        # Extract the "http" and "https" scheme names.
        key = 'http://www.baidu.com and https://boob.com'
        # {0,1} means the character before the braces ("s") occurs 0 or 1 times.
        re.findall('https{0,1}', key)
        # Result: ['http', 'https']
       
        # Extract "hit." from an e-mail address.
        # NOTE(review): the page showed "[email protected]" here, an e-mail
        # obfuscation placeholder on which both patterns below return [].
        # The address is reconstructed (local part presumed) so that the
        # documented results hold; any "...@hit.edu.com" value behaves the same.
        key = "bobo@hit.edu.com"

        # "." matches any character except "\n"; "*" means zero or more
        # repetitions; "\." is an escaped, literal dot.
        re.findall(r"h.*\.", key)
        # Result: ['hit.edu.']

        # Greedy mode: the quantifier consumes as much text as possible.
        # Appending "?" switches it to non-greedy mode, which stops at the
        # first literal dot instead of the last one.
        re.findall(r"h.*?\.", key)
        # Result: ['hit.']
       
        # Match "sas" and "saas" (but not "saaas").
        key = "saas and sas and saaas"
        # {1,2} matches 1 to 2 repetitions of the preceding expression ("a").
        re.findall('sa{1,2}s', key)
        # Result: ['saas', 'sas']
       
        # Match every line that starts with "i".
        # re.S: "." also matches newlines (text treated as a single line).
        # re.M: "^" and "$" also match at the start and end of each line.
        # The text is written with explicit "\n" so that source indentation
        # cannot leak into the string and break the "^i" anchor.
        string = (
            "fall in love with you\n"
            "i love you very much\n"
            "i love she\n"
            "i love her"
        )
        re.findall("^i.*", string, re.M)
        # Result: ['i love you very much', 'i love she', 'i love her']
       
        # Match all of the lines inside the <div> at once.
        # The text is written with explicit "\n" so that source indentation
        # does not become part of the string.
        string = (
            "<div>静夜思\n"
            "床前明月光\n"
            "疑是地上霜\n"
            "举头望明月\n"
            "低头思故乡\n"
            "</div>"
        )
        # re.S lets "." match "\n", so ".*" can run across every line up to </div>.
        re.findall('<div>.*</div>', string, re.S)
        # Result: ['<div>静夜思\n床前明月光\n疑是地上霜\n举头望明月\n低头思故乡\n</div>']

re 正则表达式 爬取网站标题

猜你喜欢

re 正则表达式爬取网站标题