前两天朋友叫我模仿一个网站。刚开始,我一个页面一个页面地查看源码并复制、保存,花了很多时间,一个字:“累”。为了减轻工作量,我写了个网站“克隆工具”,一键克隆,比起人工操作,
效率提高了200%以上,精确度也大大提高。虽然网上也有很多网站克隆工具,但我觉得作为一个程序员,要有点研究精神,哈哈哈,可以根据自己的需要随意编写自己需要的功能。
下面我将我写的“网站克隆工具”实现方法分享给大家,源码在文末有下载链接,有需要的朋友可以下载来玩,也可以根据自己的需要做相应的修改或优化。
一睹为快,先看看界面:
简单的工作流程:
项目代码目录结构:
下面一步步实现程序功能:
1.新建主界面窗体(MainForm.cs):
2.新建模型类(UrlModel.cs)
(原文此处的 UrlModel.cs 代码清单在转载时丢失,仅剩行号;完整代码请见文末的源码下载链接。)
3.新建服务类(Services)
UrlParser:
(原文此处的 UrlParser 代码清单在转载时丢失,仅剩行号;完整代码请见文末的源码下载链接。)
WebPageService:
(原文此处的 WebPageService 代码清单在转载时丢失,仅剩行号;完整代码请见文末的源码下载链接。)
4.网页源码扒取类
(原文此处的网页源码扒取类代码清单在转载时丢失,仅剩行号;完整代码请见文末的源码下载链接。)
5.网站克隆主类
接口:
/// <summary>
/// Contract for a site-clone job that can be started and cancelled.
/// </summary>
interface IWebCloneWorker
{
    /// <summary>Begins the clone job.</summary>
    void Start();

    /// <summary>Asks a running clone job to stop.</summary>
    void Cancel();
}
实现类:
/// <summary>
/// Clones a web site: first collects the page and resource URLs reachable from
/// <see cref="Url"/> (on a <see cref="BackgroundWorker"/> thread), then downloads each
/// collected URL into <see cref="SavePath"/>, mirroring the server directory layout.
/// </summary>
/// <remarks>
/// <see cref="CollectingUrl"/> and <see cref="CollectedUrl"/> are raised from the
/// worker's DoWork handler. The remaining events are raised from the worker's
/// ProgressChanged / RunWorkerCompleted handlers.
/// NOTE(review): the download itself runs inside the ProgressChanged handler; when the
/// worker is started from a UI thread this presumably blocks that thread while a file
/// is downloading - confirm before relying on UI responsiveness.
/// </remarks>
public class WebCloneWorker : IWebCloneWorker
{
    // Pause between two progress reports (one report == one URL handed to the downloader).
    private const int ReportIntervalMilliseconds = 200;

    // Pause used while collecting URLs of linked pages.
    private const int CollectPauseMilliseconds = 20;

    // File name used for URLs that end in a directory or have no extension.
    private const string DefaultFileName = "index.html";

    /// <summary>
    /// Maximum link depth to follow (e.g. 0 - home page, 1 - category pages, 2 - detail pages).
    /// </summary>
    public static int depth = 0;

    /// <summary>Address of the site to clone.</summary>
    public string Url { get; set; }

    /// <summary>Local directory the cloned files are written to.</summary>
    public string SavePath { get; set; }

    private readonly BackgroundWorker _worker;

    public event UrlChangedEventHandler UrlChanged;
    public event FileSavedSuccessEventHandler FileSavedSuccess;
    public event FileSavedFailEventHandler FileSavedFail;
    public event DownloadCompletedEventHandler DownloadCompleted;
    public event CollectingUrlEventHandler CollectingUrl;
    public event CollectedUrlEventHandler CollectedUrl;
    public event ProgressChangedEventHandler ProgressChanged;

    // All collected page and resource addresses, keyed by UrlModel.RelatedPath.
    private Dictionary<string, UrlModel> _Hrefs = new Dictionary<string, UrlModel>();

    /// <summary>
    /// All collected page and resource addresses, keyed by <c>UrlModel.RelatedPath</c>.
    /// Assigning <c>null</c> resets the collection to an empty dictionary.
    /// </summary>
    public Dictionary<string, UrlModel> Hrefs
    {
        get { return _Hrefs; }
        set { _Hrefs = value ?? new Dictionary<string, UrlModel>(); }
    }

    // Encoding name used for page requests; defaults to UTF-8.
    private string _Encoding = "utf-8";

    /// <summary>Encoding name used for page requests; defaults to "utf-8".</summary>
    public string Encoding
    {
        get { return _Encoding; }
        set { _Encoding = value; }
    }

    /// <summary>
    /// Creates a worker whose <see cref="Url"/> and <see cref="SavePath"/> are set later
    /// through the properties.
    /// </summary>
    public WebCloneWorker()
    {
        // The worker is created here as well, so Start()/Cancel() never hit a null field.
        _worker = CreateWorker();
    }

    /// <summary>Creates a worker for the given site and target directory.</summary>
    /// <param name="url">Address of the site to clone.</param>
    /// <param name="path">Directory the files are saved to.</param>
    /// <exception cref="ArgumentException">
    /// <paramref name="url"/> or <paramref name="path"/> is null or empty.
    /// </exception>
    public WebCloneWorker(string url, string path)
        : this()
    {
        this.Url = url;
        this.SavePath = path;

        // The message text is kept verbatim; no parameter name is passed so that
        // Exception.Message stays exactly the original text.
        if (string.IsNullOrEmpty(this.Url))
        {
            throw new ArgumentException("请输入网址");
        }

        if (string.IsNullOrEmpty(this.SavePath))
        {
            throw new ArgumentException("请选择要保存的目录");
        }
    }

    /// <summary>Starts the clone job unless one is already running.</summary>
    public void Start()
    {
        if (_worker.IsBusy)
        {
            return;
        }

        _worker.RunWorkerAsync();
    }

    /// <summary>Requests cancellation of the running job (no-op if already requested).</summary>
    public void Cancel()
    {
        if (_worker.CancellationPending)
        {
            return;
        }

        _worker.CancelAsync();
    }

    // Builds the BackgroundWorker with progress reporting and cancellation enabled.
    private BackgroundWorker CreateWorker()
    {
        BackgroundWorker worker = new BackgroundWorker();
        worker.WorkerReportsProgress = true;
        worker.WorkerSupportsCancellation = true;
        worker.DoWork += Worker_DoWork;
        worker.ProgressChanged += Worker_ProgressChanged;
        worker.RunWorkerCompleted += Worker_RunWorkerCompleted;
        return worker;
    }

    // Raises DownloadCompleted when the job finished without being cancelled.
    private void Worker_RunWorkerCompleted(object sender, RunWorkerCompletedEventArgs e)
    {
        if (e.Cancelled)
        {
            return;
        }

        DownloadCompletedEventHandler handler = this.DownloadCompleted;
        if (handler == null)
        {
            return;
        }

        // RunWorkerCompletedEventArgs.Result throws when Error is set, so it may only
        // be read after a successful run.
        object result = e.Error == null ? e.Result : null;
        handler(this, new DownloadCompletedEventArgs(result, e.Error, e.Cancelled));
    }

    // Handles one reported URL: forwards progress, announces the URL, then downloads it.
    private void Worker_ProgressChanged(object sender, ProgressChangedEventArgs e)
    {
        ProgressChangedEventHandler progressHandler = this.ProgressChanged;
        if (progressHandler != null)
        {
            progressHandler(this, e);
        }

        UrlModel model = (UrlModel)e.UserState;

        UrlChangedEventHandler urlHandler = this.UrlChanged;
        if (urlHandler != null)
        {
            urlHandler(this, new UrlChangedEventArgs(model));
        }

        try
        {
            string url = model.AbsoluteUri;
            string localDir;
            string fileName;
            ResolveLocalTarget(this.SavePath, url, out localDir, out fileName);

            // CreateDirectory is a no-op when the directory already exists.
            Directory.CreateDirectory(localDir);

            // Files that already exist are not downloaded again (and raise no event).
            if (File.Exists(Path.Combine(localDir, fileName)))
            {
                return;
            }

            HttpTool.DownFile(url, localDir, fileName);

            FileSavedSuccessEventHandler savedHandler = this.FileSavedSuccess;
            if (savedHandler != null)
            {
                savedHandler(this, new FileSavedSuccessEventArgs(model));
            }
        }
        catch (Exception ex)
        {
            // Best effort per file: a failure is reported and the job carries on.
            FileSavedFailEventHandler failHandler = this.FileSavedFail;
            if (failHandler != null)
            {
                failHandler(this, new FileSavedFailEventArgs(ex));
            }
        }
    }

    // Maps a remote URL to the local directory and file name it is saved under.
    // Throws InvalidOperationException when the result would lie outside saveRoot.
    private static void ResolveLocalTarget(string saveRoot, string url, out string localDir, out string fileName)
    {
        string serverPath = GetServerPath(url);

        fileName = Path.GetFileName(serverPath);
        if (string.IsNullOrEmpty(fileName) || fileName.IndexOf('.') < 0)
        {
            // Directory-style URL: store it as <directory>/index.html.
            fileName = DefaultFileName;
            if (!serverPath.EndsWith("/", StringComparison.Ordinal))
            {
                serverPath += "/";
            }
        }

        fileName = System.Web.HttpUtility.UrlDecode(fileName);

        // GetDirectoryName returns null for the root path; treat that as "no sub-directory".
        string relativeDir = Path.GetDirectoryName(serverPath) ?? string.Empty;
        // Leading separators must go, otherwise Path.Combine would discard the root.
        relativeDir = relativeDir.TrimStart(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar);

        string root = Path.GetFullPath(saveRoot);
        localDir = Path.GetFullPath(Path.Combine(root, relativeDir));
        string target = Path.GetFullPath(Path.Combine(localDir, fileName));

        // The URL comes from a remote page: ".." segments (plain or percent-encoded in
        // the file name) must not be able to write outside the save directory.
        string separator = Path.DirectorySeparatorChar.ToString();
        string rootPrefix = root.EndsWith(separator, StringComparison.Ordinal) ? root : root + separator;
        if (!target.StartsWith(rootPrefix, StringComparison.OrdinalIgnoreCase))
        {
            throw new InvalidOperationException("目标路径超出保存目录:" + url);
        }
    }

    // Returns the path part of an absolute URL (always starting with '/'),
    // without query string or fragment. A URL with no path yields "/".
    private static string GetServerPath(string url)
    {
        // Cut query/fragment first so a '/' inside them is never mistaken for the path start.
        int cut = url.IndexOfAny(new char[] { '?', '#' });
        string withoutQuery = cut >= 0 ? url.Substring(0, cut) : url;

        int schemeEnd = withoutQuery.IndexOf("://", StringComparison.Ordinal);
        int hostStart = schemeEnd >= 0 ? schemeEnd + 3 : 0;
        int pathStart = withoutQuery.IndexOf('/', hostStart);

        return pathStart >= 0 ? withoutQuery.Substring(pathStart) : "/";
    }

    // Worker body: collects all URLs, then reports them one by one for download.
    private void Worker_DoWork(object sender, DoWorkEventArgs e)
    {
        GetResource();

        // A cancel requested while collecting must be reported as cancelled even
        // when nothing was collected.
        if (_worker.CancellationPending)
        {
            e.Cancel = true;
            return;
        }

        // Snapshot, so the enumeration does not depend on the live dictionary.
        List<UrlModel> pending = new List<UrlModel>(this.Hrefs.Values);
        int index = 1;
        foreach (UrlModel model in pending)
        {
            if (_worker.CancellationPending)
            {
                e.Cancel = true;
                return;
            }

            // The first argument is a running counter, not a percentage.
            _worker.ReportProgress(index, model);
            index++;

            Thread.Sleep(ReportIntervalMilliseconds);
        }
    }

    // Fetches the start page, collects everything reachable from it, then raises CollectedUrl.
    private void GetResource()
    {
        string startUrl = this.Url;
        string msg;
        string html = HttpTool.HttpGet(startUrl, startUrl, this.Encoding, out msg);

        GetHrefs(0, startUrl, html);

        CollectedUrlEventHandler handler = this.CollectedUrl;
        if (handler != null)
        {
            handler(this, new CollectedUrlEventArgs(new UrlModel()));
        }
    }

    // Collects the page itself, its resources (src) and its links (href). Each linked
    // page is fetched so its resources are collected too; links are followed
    // recursively while level < depth. Returns early once cancellation is requested.
    private void GetHrefs(int level, string url, string html)
    {
        if (!TryCollect(UrlParser.Parse(url)))
        {
            return;
        }

        // Elements carrying an href attribute (pages).
        List<UrlModel> pageLinks = WebPageService.GetLocalHrefs(url, html);
        // Elements carrying a src attribute (images, scripts, other files).
        List<UrlModel> resources = WebPageService.GetLocalSrcs(url, html);

        if (!TryCollectAll(resources, false))
        {
            return;
        }

        if (pageLinks == null)
        {
            return;
        }

        foreach (UrlModel link in pageLinks)
        {
            if (link == null)
            {
                continue;
            }

            if (!TryCollect(link))
            {
                return;
            }

            string msg;
            string childHtml = HttpTool.HttpGet(link.AbsoluteUri, link.AbsoluteUri, this.Encoding, out msg);

            List<UrlModel> childResources = WebPageService.GetLocalSrcs(link.AbsoluteUri, childHtml);
            if (!TryCollectAll(childResources, true))
            {
                return;
            }

            Thread.Sleep(CollectPauseMilliseconds);

            // The depth limit only stops the descent. Leaving the loop here (as the
            // code used to do) dropped every link after the first one on the page.
            if (level < depth)
            {
                GetHrefs(level + 1, link.AbsoluteUri, childHtml);
            }
        }
    }

    // Collects every entry of models (null list allowed), optionally pausing after each.
    // Returns false as soon as cancellation has been requested.
    private bool TryCollectAll(List<UrlModel> models, bool pauseAfterEach)
    {
        if (models == null)
        {
            return true;
        }

        foreach (UrlModel model in models)
        {
            if (!TryCollect(model))
            {
                return false;
            }

            if (pauseAfterEach)
            {
                Thread.Sleep(CollectPauseMilliseconds);
            }
        }

        return true;
    }

    // Adds model to Hrefs and raises CollectingUrl, silently skipping null entries and
    // keys that are already present. Returns false when cancellation has been requested.
    // Exceptions thrown by CollectingUrl subscribers are no longer swallowed; they
    // surface through the worker's completion error.
    private bool TryCollect(UrlModel model)
    {
        if (_worker.CancellationPending)
        {
            return false;
        }

        if (model == null || model.RelatedPath == null || this.Hrefs.ContainsKey(model.RelatedPath))
        {
            return true;
        }

        this.Hrefs.Add(model.RelatedPath, model);

        CollectingUrlEventHandler handler = this.CollectingUrl;
        if (handler != null)
        {
            handler(this, new CollectingUrlEventArgs(model));
        }

        return true;
    }
}
6.一些事件、委托类:
/// <summary>Signature of handlers for the UrlChanged event.</summary>
public delegate void UrlChangedEventHandler(object sender, UrlChangedEventArgs e);

/// <summary>Signature of handlers for the FileSavedSuccess event.</summary>
public delegate void FileSavedSuccessEventHandler(object sender, FileSavedSuccessEventArgs e);

/// <summary>Signature of handlers for the FileSavedFail event.</summary>
public delegate void FileSavedFailEventHandler(object sender, FileSavedFailEventArgs e);

/// <summary>Signature of handlers for the DownloadCompleted event.</summary>
public delegate void DownloadCompletedEventHandler(object sender, DownloadCompletedEventArgs e);

/// <summary>Signature of handlers for the CollectingUrl event.</summary>
public delegate void CollectingUrlEventHandler(object sender, CollectingUrlEventArgs e);

/// <summary>Signature of handlers for the CollectedUrl event.</summary>
public delegate void CollectedUrlEventHandler(object sender, CollectedUrlEventArgs e);

/// <summary>Signature of handlers for the ProgressChanged event.</summary>
public delegate void ProgressChangedEventHandler(object sender, ProgressChangedEventArgs e);
public class CollectedUrlEventArgs : EventArgs public class CollectingUrlEventArgs : EventArgs public class DownloadCompletedEventArgs : EventArgs public class FileSavedFailEventArgs : EventArgs public class FileSavedSuccessEventArgs : EventArgs public class UrlChangedEventArgs : EventArgs
代码有点多,各位有需要的还是下载源码查看并运行吧,由于赶时间,没时间仔细测试程序的各个功能,难免有不足的地方。
百度网盘:链接:https://pan.baidu.com/s/1hja1rl9UEcl0dzTqVFt0dg 密码:7s6r
大家好,我算是老司机了,一直从事编程工作十多年,欢迎编程界的朋友,一起学习,一起交流!活到老学到老。。。。。。。。。。。。。。。。。。。。。。
http://www.cgpwyj.cn/
http://news.cgpwyj.cn/
http://item.cgpwyj.cn/
http://www.peacemind.com.cn/
http://news.peacemind.com.cn/
http://item.peacemind.com.cn/
http://www.tasknet.com.cn/
http://news.tasknet.com.cn/
http://item.tasknet.com.cn/
http://www.ownbar.cn/
http://news.ownbar.cn/
http://item.ownbar.cn
http://www.shtarchao.net.cn/
http://news.shtarchao.net.cn/
http://item.shtarchao.net.cn/
http://www.metroworld.com.cn/
http://news.metroworld.com.cn/
http://www.cngodo.cn/
http://news.cngodo.cn/
http://item.cngodo.cn/
http://www.gzrdbp.cn/
http://news.gzrdbp.cn/
http://item.gzrdbp.cn/
http://www.dnapt.cn/
http://news.dnapt.cn/
http://item.dnapt.cn/
http://www.ncxlk.cn/
http://news.ncxlk.cn/
http://item.ncxlk.cn/
http://www.zgxxyp.cn/
http://news.zgxxyp.cn/
http://item.zgxxyp.cn/
http://www.sjjdvr.cn/
http://news.sjjdvr.cn/
http://item.sjjdvr.cn/
http://www.sujinkeji.cn/
http://news.sujinkeji.cn/
http://item.sujinkeji.cn/
http://www.zsjxbd.cn/
http://news.zsjxbd.cn/
http://item.zsjxbd.cn/
http://www.yesgas.cn/
http://news.yesgas.cn/
http://item.yesgas.cn/
http://www.quickpass.sh.cn/
http://news.quickpass.sh.cn/
http://item.quickpass.sh.cn/
http://www.jspcrm.cn/
http://news.jspcrm.cn/
http://item.jspcrm.cn/
http://www.yjdwpt.cn/
http://news.yjdwpt.cn/
http://item.yjdwpt.cn/
http://www.henanwulian.cn/
http://news.henanwulian.cn/
http://item.henanwulian.cn/
http://www.hhrshh.cn/
http://news.hhrshh.cn/
http://item.hhrshh.cn/
http://www.gpgold.cn/
http://news.gpgold.cn/
http://item.gpgold.cn/
http://www.jingzhuiyou.cn/
http://news.jingzhuiyou.cn/
http://item.jingzhuiyou.cn/