现在网上有大把的Python爬虫教程,却很少看见有用C#写的。正好新出的.NET Core可以很方便地部署到Linux上,这里就用妹子图做示范,写个小爬虫。
在C#下有个很方便的类库 HtmlAgilityPack 可以用来分析网页
我们先新建一个.NET Core控制台项目MzituCrawler,然后在NuGet管理器中添加HtmlAgilityPack的引用:Install-Package HtmlAgilityPack -Version 1.9.2
我们打开妹子图的网页,点击下面的翻页按钮,发现每页的地址有个固定的格式 https://www.mzitu.com/page/页码/
查看网页上链接的元素可以看到每个链接对应的xpath地址为 //*[@id='pins']/li/a
我们用HtmlAgilityPack获取每一页的内容
// Walk the paginated index (/page/1/, /page/2/, ...) and download every gallery
// linked from each index page.
int pageIndex = 1;
HtmlNodeCollection nodes;
HtmlWeb web = new HtmlWeb();
do
{
    var url = new Uri(new Uri(baseUrl), $"/page/{pageIndex++}/").ToString();
    var doc = web.Load(url);

    // SelectNodes returns null (not an empty collection) when the XPath matches
    // nothing, so the null check has to come before Count is touched.
    nodes = doc.DocumentNode.SelectNodes("//*[@id='pins']/li/a");
    if (nodes != null && nodes.Count > 0)
    {
        foreach (var node in nodes)
        {
            // A link without an <img> child has no title; fall back to an empty
            // title instead of dereferencing a null node.
            var title = node.SelectSingleNode("img")?.GetAttributeValue("alt", string.Empty) ?? string.Empty;

            var href = node.GetAttributeValue("href", string.Empty);
            if (string.IsNullOrEmpty(href))
            {
                // An empty href would resolve to the site root, which is not a gallery.
                continue;
            }

            // Resolve relative links against the site root.
            href = new Uri(new Uri(baseUrl), href).ToString();
            DownloadImages(downloadFolder: Path.Combine(baseFolder, title), url: href);
        }
    }
    // NOTE(review): 24 is presumably the number of entries on a full index page,
    // so a shorter page is treated as the last one — confirm against the site.
} while (nodes != null && nodes.Count >= 24);
其中方法 DownloadImages 是下载对应链接里面图片的方法
/// <summary>
/// Downloads the images of one gallery into <paramref name="downloadFolder"/>.
/// </summary>
/// <remarks>
/// Pages are requested as <c>{url}/1</c>, <c>{url}/2</c>, ... and the loop stops at the
/// first page that has no image node at the expected XPath. Each image is saved under
/// the last path segment of its URL.
/// </remarks>
/// <param name="downloadFolder">Target directory; created if it does not exist.</param>
/// <param name="url">Absolute URL of the gallery; also sent as the Referer header.</param>
private static void DownloadImages(string downloadFolder, string url)
{
    // CreateDirectory is a no-op when the directory already exists, so no Exists check is needed.
    Directory.CreateDirectory(downloadFolder);

    HtmlWeb web = new HtmlWeb();
    var pageIndex = 1;
    HtmlNode imageNode;

    // One client per gallery instead of one per image: the headers are identical for
    // every request, and creating/disposing an HttpClient per request can exhaust sockets.
    using (var client = new HttpClient())
    {
        // NOTE(review): the Host header is pinned to this value regardless of the host
        // in the image URL — confirm the images are always served from it.
        client.DefaultRequestHeaders.Host = "i.meizitu.net";
        client.DefaultRequestHeaders.Pragma.ParseAdd("no-cache");
        // NOTE(review): compressed encodings are advertised here, but no automatic
        // decompression is configured on the client — confirm responses arrive uncompressed.
        client.DefaultRequestHeaders.AcceptEncoding.ParseAdd("gzip, deflate");
        client.DefaultRequestHeaders.AcceptLanguage.ParseAdd("zh-CN,zh;q=0.8,en;q=0.6");
        client.DefaultRequestHeaders.CacheControl = new System.Net.Http.Headers.CacheControlHeaderValue { NoCache = true };
        client.DefaultRequestHeaders.Connection.ParseAdd("keep-alive");
        client.DefaultRequestHeaders.Referrer = new Uri(url);
        client.DefaultRequestHeaders.UserAgent.ParseAdd("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36");
        client.DefaultRequestHeaders.Accept.ParseAdd("image/webp,image/apng,image/*,*/*;q=0.8");

        do
        {
            var doc = web.Load($"{url}/{pageIndex++}");
            imageNode = doc.DocumentNode.SelectSingleNode("/html/body/div[2]/div[1]/div[3]/p/a/img");
            if (imageNode != null)
            {
                // Resolve a possibly relative src against the gallery URL.
                var imageUrl = imageNode.GetAttributeValue("src", string.Empty);
                imageUrl = new Uri(new Uri(url), imageUrl).ToString();

                var buffer = client.GetByteArrayAsync(imageUrl).Result;
                var fileName = new Uri(imageUrl).Segments.Last();
                File.WriteAllBytes(Path.Combine(downloadFolder, fileName), buffer);
            }
        } while (imageNode != null);
    }
}