背景
前段时间,项目需要实现一个功能:根据已提供的数据(网站URL和对应的错别字),对网页中的错别字进行高亮显示,并自动截图取证。
经历
起初,因为一些原因,临时采用了cdp4j与Robot相结合的方式:通过Ctrl + F打开页面内查找,再通过Ctrl + C/V把错别字粘贴到查找框,模拟手动查找后进行截图。
弊端:
1.服务器上面需要安装Chrome;
2.截图时,需要当前Chrome窗口保持在顶层,否则会将需要查找的错别字,粘贴到其它地方;
3.由于第二条的限制,所以没法启动多个线程同时截图;
部分代码:
/**
 * Opens the target page in Chrome through cdp4j, drives the browser's
 * find-in-page bar with java.awt.Robot key events to locate the misspelled
 * word, and writes a PNG screenshot into the directory named by filePath
 * (declared outside this snippet).
 *
 * The Robot key events and the system clipboard are global to the desktop,
 * so the launched Chrome window has to stay in the foreground while this
 * runs; that is also why several instances cannot run in parallel.
 */
@Override
public void run(){
Launcher launcher = null;
Robot robot = null;
OutputStream out = null;
Session session = null;
try{
// Start Chrome, open a session and load the page to capture.
launcher = new Launcher();
SessionFactory factory = launcher.launch();
session = factory.create();
session.navigate("需要截图的网站URL");
session.waitDocumentReady(30000);// wait at most 30 seconds for rendering
// Ctrl+F: open the browser's find-in-page bar.
robot = new Robot();
robot.keyPress(KeyEvent.VK_CONTROL);
robot.keyPress(KeyEvent.VK_F);
robot.keyRelease(KeyEvent.VK_F);
robot.keyRelease(KeyEvent.VK_CONTROL);
// Put the word to search for on the system clipboard ...
Clipboard clipboard = Toolkit.getDefaultToolkit().getSystemClipboard();
Transferable transferable = new StringSelection("需要查找的错别字");
clipboard.setContents(transferable, null);
// ... then Ctrl+V pastes it into whatever currently has keyboard focus.
robot.keyPress(KeyEvent.VK_CONTROL);
robot.keyPress(KeyEvent.VK_V);
robot.keyRelease(KeyEvent.VK_V);
robot.keyRelease(KeyEvent.VK_CONTROL);
// Enter: run the search.
robot.keyPress(KeyEvent.VK_ENTER);
robot.keyRelease(KeyEvent.VK_ENTER);
// NOTE(review): there is no delay between the key events and the capture;
// presumably the browser may not have highlighted the match yet — confirm.
byte[] png = session.captureScreenshot();
if(png!=null && png.length>0){
// Make sure the output directory exists. Both outcomes of mkdirs() leave
// `file` pointing at filePath, so the branch below does not change anything.
File file = new File(filePath);
if(!file.mkdirs()){
file = new File(filePath);
}
// Write the image bytes to <filePath>/<random UUID>.png.
String path = filePath + "/"+UUID.randomUUID() + ".png";
file = new File(path);
out = new FileOutputStream(file);
out.write(png);
out.flush();
// your own business logic goes here
.....
}
} catch (Exception e) {
e.printStackTrace();
}finally {
if(session!=null){
session.close();
}
// Close the current window (disabled)
//if(robot!=null){
// robot.keyPress(KeyEvent.VK_CONTROL);
// robot.keyPress(KeyEvent.VK_W);
// robot.keyRelease(KeyEvent.VK_W);
// robot.keyRelease(KeyEvent.VK_CONTROL);
//}
// Kill the browser process started by the launcher.
if(launcher!=null){
launcher.getProcessManager().kill();
}
// Close the output stream.
if(out!=null){
try {
out.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
目前方法
利用phantomjs进行截图。需要两个东西:1.phantomjs.exe ; 2.js脚本(此处的screenshot.js)
好处:
1.phantomjs是无头浏览器,截图过程中,不会弹出浏览器页面,操作服务器不会造成影响;
(phantomjs.exe可从PhantomJS官网下载,将exe文件解压到指定目录即可。)
2.可以多个线程同时截图;
3.可以截长图;
java代码:
/**
 * Takes a screenshot of {@code url} with the misspelled {@code word}
 * highlighted by running the bundled PhantomJS executable with
 * screenshot.js, then continues with the caller's own logic if the image
 * file was produced.
 *
 * The command is passed to PhantomJS as a list of separate arguments rather
 * than one concatenated string, so a path, URL or word that contains
 * whitespace still arrives as a single argument.
 */
@Override
public void run() {
    String projectPath = Thread.currentThread().getContextClassLoader().getResource("").getPath();
    // NOTE(review): drops the first character and the last 16 characters of the
    // classpath root; presumably a leading "/" and a trailing "WEB-INF/classes/"
    // (16 characters) — confirm against the deployment layout.
    String needPath = projectPath.substring(1, projectPath.length() - 16);
    String path = needPath + "phantomjs/";
    String file_name = UUID.randomUUID() + ".jpg";
    String new_file = AppConfig.WORDSSCREENSHOT_DIR + "/" + file_name;// where the screenshot is saved
    String phantomjsExePath = path + "phantomjs.exe";// phantomjs location inside the project
    String codejsPath = path + "screenshot.js";// script location
    Process process = null;
    // Collected output of the child process (stdout and stderr merged).
    StringBuffer sbf = new StringBuffer();
    try {
        // url is the site URL, word is the misspelled word to look for.
        // Each value is its own argument, so no manual space-separation or quoting is needed.
        ProcessBuilder builder = new ProcessBuilder(
                phantomjsExePath, codejsPath, url.trim(), new_file, word);
        // Merge stderr into stdout so a chatty stderr cannot fill its pipe and stall the child.
        builder.redirectErrorStream(true);
        process = builder.start();
        // Drain the output until the child closes it; the reader is closed on every path.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(process.getInputStream()))) {
            String tmp;
            while ((tmp = br.readLine()) != null) {
                sbf.append(tmp);
            }
        }
        // Make sure the process has really exited before looking for the file.
        process.waitFor();
        System.out.println("url:" + url + " -->截图结束");
        File screenshot = new File(new_file);
        if (screenshot.exists()) {// screenshot succeeded
            // your own business logic goes here
            ...
        }
    } catch (InterruptedException e) {
        // Preserve the interrupt for whoever owns this thread.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (process != null) {
            process.destroy();
        }
    }
}
js代码:
/**
 * Created by RYK on 2018/5/24.
 *
 * PhantomJS script: opens a page, highlights every occurrence of a word with
 * a yellow background and renders the page to an image file.
 *
 * Usage: phantomjs screenshot.js <url> <outputPath> <word>
 *
 * Exits with status 1 when arguments are missing or the page fails to load.
 *
 * Written in ES5 (var, function expressions) on purpose: the script runs
 * inside PhantomJS, not Node, and its engine predates let/const/arrow
 * functions.
 */
var page = require('webpage').create();
var system = require('system');

page.viewportSize = {
    width : 1024,
    height : 800
};
// NOTE(review): this replaces the whole settings object instead of setting
// individual properties; settings not listed here presumably lose their
// PhantomJS defaults — confirm before relying on any of them.
page.settings = {
    javascriptEnabled : true,
    loadImages : true,
    userAgent : 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 (KHTML, like Gecko) PhantomJS/20.0',
    resourceTimeout : 60 * 1000
};

var address, filePath, value;
if (system.args.length < 4) {
    console.log('Usage: phantomjs screenshot.js <url> <outputPath> <word>');
    phantom.exit(1);
} else {
    address = system.args[1]; // site URL
    filePath = system.args[2]; // where the image is saved
    value = system.args[3]; // misspelled word to highlight
    page.open(address, function (status) {
        if (status !== 'success') {
            console.log('FAIL to load the address');
            phantom.exit(1);
            // phantom.exit() does not stop the current callback, so leave
            // explicitly instead of going on to evaluate and render.
            return;
        }
        // This function runs inside the page, so it can only use its own
        // argument, not variables from this script.
        page.evaluate(function (word) {
            // Scroll far down the page before highlighting.
            window.scrollTo(0, 10000);
            window.setTimeout(function () {
                if (!word || !document.body) {
                    return;
                }
                // Only text nodes are touched, and the word is matched
                // literally. Rewriting body.innerHTML with a RegExp would also
                // hit tag names, attribute values and inline scripts, and would
                // treat characters such as "." or "(" in the word as regex syntax.
                var skippedParents = { SCRIPT : true, STYLE : true, NOSCRIPT : true, TEXTAREA : true };
                var walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT, null, false);
                var hits = [];
                var node;
                // Collect first, modify afterwards: changing the tree while
                // walking it would disturb the traversal.
                while ((node = walker.nextNode())) {
                    var parent = node.parentNode;
                    if (parent && !skippedParents[parent.nodeName] && node.nodeValue.indexOf(word) !== -1) {
                        hits.push(node);
                    }
                }
                for (var i = 0; i < hits.length; i++) {
                    var textNode = hits[i];
                    var pieces = textNode.nodeValue.split(word);
                    var fragment = document.createDocumentFragment();
                    for (var j = 0; j < pieces.length; j++) {
                        // Between two consecutive pieces there was one occurrence of the word.
                        if (j > 0) {
                            var mark = document.createElement('span');
                            mark.style.background = 'yellow';
                            mark.appendChild(document.createTextNode(word));
                            fragment.appendChild(mark);
                        }
                        if (pieces[j]) {
                            fragment.appendChild(document.createTextNode(pieces[j]));
                        }
                    }
                    textNode.parentNode.replaceChild(fragment, textNode);
                }
            }, 1000);
        }, value);
        // Render after a fixed 5 s wait, leaving time for the in-page
        // highlighting (scheduled 1 s after evaluate) to finish.
        window.setTimeout(function () {
            page.render(filePath); // save the screenshot
            phantom.exit();
        }, 5000);
    });
}
不足
测试时,单个网站截图用时较长,具体原因尚未深入排查(脚本中截图前固定等待的5秒应是原因之一)。正式环境可以用多线程弥补。
说明
这是我的第一篇文章,多多指正。
感谢前端同事在高亮显示实现上的指导。