今天试着抓取一个网站的数据,发现网站加了cookie限制,浏览器一共访问三次才可以访问到主页,第一次访问会通过js修改本地cookie,然后js(加密过-_-!)会刷新本页带着更新后的cookie,第二次访问返回的头信息中再次对cookie进行更改,并进行302重定向,第三次访问才最终访问到网站数据。坑啊,抓个网站这么麻烦。
1.分析一下,第一次访问,js是加密过的,首先需要解决加密问题,用的php抓取页面,拿到的数据是这样的
HTTP/1.1 200 OK
Server: nginx
Date: Wed, 30 Mar 2016 09:43:18 GMT
Content-Type: text/html
Transfer-Encoding: chunked
Connection: keep-alive
X-Powered-By-360WZB: wangzhan.360.cn
Set-Cookie: wzwsconfirm=11bdf0ec3c162bd5bd6b2f6974a04413; path=/
<html>
<head>
</head>
<body>
<noscript>
<h1><strong>请开启JavaScript并刷新该页.</strong></h1>
</noscript>
<script type="text/javascript">
eval(function(p,a,c,k,e,d){e=function(c){return(c<a?'':e(parseInt(c/a)))+((c=c%a)>32?String.fromCharCode(c+32):c.toString(33))};if(!''.replace(/^/,String)){while(c--)d[e(c)]=k[c]||e(c);k=[function(e){return d[e]}];e=function(){return'\\w+'};c=1};while(c--)if(k[c])p=p.replace(new RegExp('\\b'+e(c)+'\\b','g'),k[c]);return p}('15 D="k";15 1a="i";15 1b="l";15 11=7;15 F = "e+/=";J g(10) {15 U, N, R;15 o, p, q;R = 10.S;N = 0;U = "";17 (N < R) {o = 10.s(N++) & 6;O (N == R) {U += F.r(o >> a);U += F.r((o & 1) << c);U += "==";n;}p = 10.s(N++);O (N == R) {U += F.r(o >> a);U += F.r(((o & 1) << c) | ((p & 5) >> c));U += F.r((p & 4) << a);U += "=";n;}q = 10.s(N++);U += F.r(o >> a);U += F.r(((o & 1) << c) | ((p & 5) >> c));U += F.r(((p & 4) << a) | ((q & 3) >> d));U += F.r(q & 2);}W U;}J H(){15 16= 19.Q||B.C.u||B.m.u;15 K= 19.P||B.C.t||B.m.t;O (16*K <= 9) {W 14;}15 1d = 19.Y;15 1e = 19.Z;O (1d + 16 <= 0 || 1e + K <= 0 || 1d >= 19.X.18 || 1e >= 19.X.M) {W 14;}W G;}J h(){15 12 = 1a+1b;15 L = 0;15 N = 0;I(N = 0; N < 12.S; N++) {L += 12.s(N);}L *= b;L += 8;W "j"+L;}J f(){O(H()) {} E {15 A = ""; A = "1c="+g(11.13()) + "; V=/";B.w = A; 15 v = h();A = "1a="+g(v.13()) + "; V=/";B.w = A; 19.T=D;}}f();',59,74,'0|0x3|0x3f|0xc0|0xf|0xf0|0xff|1|111111|120000|2|3|4|6|ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789|HXXTTKKLLPPP5|KTKY2RBD9NHPBCIHV9ZMEQQDARSLVFDU|QWERTASDFGXYSF|RANDOMSTR23640|WZWS_CONFIRM_PREFIX_LABEL1|/|STRRANDOM23640|body|break|c1|c2|c3|charAt|charCodeAt|clientHeight|clientWidth|confirm|cookie|cookieString|document|documentElement|dynamicurl|else|encoderchars|false|findDimensions|for|function|h|hash|height|i|if|innerHeight|innerWidth|len|length|location|out|path|return|screen|screenX|screenY|str|template|tmp|toString|true|var|w|while|width|window|wzwschallenge|wzwschallengex|wzwstemplate|x|y'.split('|'),0,{}))
</script>
通过网上的解密工具(js在线加解密)可以很快地把js还原回来:
// Target assigned to window.location once both cookies have been written.
var dynamicurl = "/";
// The two challenge strings; QWERTASDFGXYSF() concatenates them and sums their character codes.
var wzwschallenge = "RANDOMSTR23640";
var wzwschallengex = "STRRANDOM23640";
// Value that gets encoded into the "wzwstemplate" cookie.
var template = 1;
// Lookup table used by the encoder below: the 64 base64 symbols followed by "=".
var encoderchars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";
/**
 * Encodes a string with the base64 scheme, using `encoderchars` as the symbol table.
 *
 * Input is consumed three character codes at a time; each group yields four
 * symbols. A trailing group of one or two characters is padded with "==" or "=".
 *
 * @param {string} str - Text to encode.
 * @returns {string} The encoded text.
 */
function KTKY2RBD9NHPBCIHV9ZMEQQDARSLVFDU(str) {
  const total = str.length;
  let encoded = "";

  for (let pos = 0; pos < total; pos += 3) {
    const remaining = total - pos;

    // The first symbol only ever depends on the first character of the group.
    const first = str.charCodeAt(pos) & 0xff;
    encoded += encoderchars.charAt(first >> 2);

    if (remaining === 1) {
      encoded += encoderchars.charAt((first & 0x3) << 4);
      encoded += "==";
      break;
    }

    const second = str.charCodeAt(pos + 1);
    encoded += encoderchars.charAt(((first & 0x3) << 4) | ((second & 0xf0) >> 4));

    if (remaining === 2) {
      encoded += encoderchars.charAt((second & 0xf) << 2);
      encoded += "=";
      break;
    }

    const third = str.charCodeAt(pos + 2);
    encoded += encoderchars.charAt(((second & 0xf) << 2) | ((third & 0xc0) >> 6));
    encoded += encoderchars.charAt(third & 0x3f);
  }

  return encoded;
}
/**
 * Reports whether the browser window looks too small or is positioned off-screen.
 *
 * @returns {boolean} true when the viewport area is at most 120000 square pixels
 *   or the window lies entirely outside the screen bounds; false otherwise.
 */
function findDimensions() {
  // Viewport size, falling back to the document element and then the body.
  const viewportWidth =
    window.innerWidth || document.documentElement.clientWidth || document.body.clientWidth;
  const viewportHeight =
    window.innerHeight || document.documentElement.clientHeight || document.body.clientHeight;

  if (viewportWidth * viewportHeight <= 120000) {
    return true;
  }

  const left = window.screenX;
  const upper = window.screenY;

  const isOffScreen =
    left + viewportWidth <= 0 ||
    upper + viewportHeight <= 0 ||
    left >= window.screen.width ||
    upper >= window.screen.height;

  return isOffScreen;
}
/**
 * Derives the confirm string from the two global challenge strings.
 *
 * The character codes of `wzwschallenge + wzwschallengex` are summed, the sum
 * is multiplied by 3 and offset by 111111, and the result is appended to a
 * fixed prefix.
 *
 * @returns {string} Prefix followed by the computed number.
 */
function QWERTASDFGXYSF() {
  const combined = wzwschallenge + wzwschallengex;

  let codeSum = 0;
  for (let pos = 0; pos < combined.length; pos += 1) {
    codeSum += combined.charCodeAt(pos);
  }

  const answer = codeSum * 3 + 111111;
  return "WZWS_CONFIRM_PREFIX_LABEL1" + answer;
}
/**
 * Entry point of the challenge: writes the two challenge cookies and then
 * navigates to `dynamicurl`. Does nothing when findDimensions() returns true.
 */
function HXXTTKKLLPPP5() {
  if (findDimensions()) {
    return;
  }

  // Cookie 1: the encoded template value.
  const encodedTemplate = KTKY2RBD9NHPBCIHV9ZMEQQDARSLVFDU(template.toString());
  document.cookie = "wzwstemplate=" + encodedTemplate + "; path=/";

  // Cookie 2: the encoded confirm string derived from the challenge strings.
  const answer = QWERTASDFGXYSF();
  const encodedAnswer = KTKY2RBD9NHPBCIHV9ZMEQQDARSLVFDU(answer.toString());
  document.cookie = "wzwschallenge=" + encodedAnswer + "; path=/";

  // Load the target with the new cookies in place.
  window.location = dynamicurl;
}
// Run the cookie-setting routine as soon as the script is evaluated.
HXXTTKKLLPPP5();
发现是通过一定算法改变了本地的cookie,具体什么算法洒家就不知道了,于是就想可以安装一下v8js或者spidermonkey,可是奈何能力有限并不能在windows下成功安装,就想大不了自己把js代码还原成php代码啦。。。于是,终于粗来了
// NOTE: the ** ... ** pairs below are the article's emphasis markers for values
// that differ on every request. They are not PHP syntax: remove them and
// substitute the values extracted from the current response before running.
$dynamicurl = "/";
// The two challenge strings passed to QWERTASDFGXYSF().
$wzwschallenge = "**RANDOMSTR5937**";
$wzwschallengex = "**STRRANDOM5937**";
// Value that gets encoded into the "wzwstemplate" cookie.
$template = **7**;
// Lookup alphabet passed to the encoder as its $encode argument.
$encoderchars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";
/**
 * PHP port of the site's JavaScript encoder (base64 scheme).
 *
 * The input is consumed three bytes at a time; each group yields four symbols
 * looked up in $encode. A trailing group of one or two bytes is padded with
 * "==" or "=".
 *
 * @param string $str    Text to encode, processed byte by byte.
 * @param string $encode Lookup alphabet; position N supplies the symbol for 6-bit value N.
 * @return string The encoded text.
 */
function KTKY2RBD9NHPBCIHV9ZMEQQDARSLVFDU($str,$encode)
{
    $length = strlen($str);
    $result = "";

    for ($pos = 0; $pos < $length; $pos += 3) {
        $remaining = $length - $pos;

        // The first symbol only ever depends on the first byte of the group.
        $first = ord(substr($str, $pos, 1)) & 0xff;
        $result .= substr($encode, $first >> 2, 1);

        if ($remaining == 1) {
            $result .= substr($encode, ($first & 0x3) << 4, 1);
            $result .= "==";
            break;
        }

        $second = ord(substr($str, $pos + 1, 1));
        $result .= substr($encode, (($first & 0x3) << 4) | (($second & 0xf0) >> 4), 1);

        if ($remaining == 2) {
            $result .= substr($encode, ($second & 0xf) << 2, 1);
            $result .= "=";
            break;
        }

        $third = ord(substr($str, $pos + 2, 1));
        $result .= substr($encode, (($second & 0xf) << 2) | (($third & 0xc0) >> 6), 1);
        $result .= substr($encode, $third & 0x3f, 1);
    }

    return $result;
}
/**
 * Builds the confirm string that is subsequently encoded into the
 * "wzwschallenge" cookie (PHP port of the JavaScript function of the same name).
 *
 * The byte values of $wzwschallenge . $wzwschallengex are summed, the sum is
 * multiplied by $multiplier and offset by 111111, and the result is appended
 * to $prefix.
 *
 * The multiplier and the prefix differ on every request, so they are
 * parameters; the defaults are the values from the sample response this port
 * was written against.
 *
 * @param string $wzwschallenge  First challenge string from the response.
 * @param string $wzwschallengex Second challenge string from the response.
 * @param int    $multiplier     Per-request multiplier taken from the response.
 * @param string $prefix         Per-request prefix taken from the response.
 * @return string Prefix followed by the computed number.
 */
function QWERTASDFGXYSF($wzwschallenge, $wzwschallengex, $multiplier = 19, $prefix = "WZWS_CONFIRM_PREFIX_LABEL7")
{
    $tmp = $wzwschallenge . $wzwschallengex;
    $length = strlen($tmp);

    $hash = 0;
    for ($i = 0; $i < $length; $i++) {
        $hash += ord(substr($tmp, $i, 1));
    }

    $hash *= $multiplier;
    $hash += 111111;

    return $prefix . $hash;
}
第一个函数用于生成cookie,第二个函数用于生成js代码里的那个confirm,函数名称是对应的。加星号的地方是每次请求都会变化的地方。好啦只剩下解密了,于是分析了一下网上在线解密工具的js源码
/**
 * Unpacks an eval-wrapped script: reads the text of the element with id
 * "code", strips the leading `eval`, evaluates what remains and writes the
 * result back into the same element.
 *
 * Removing the outer `eval` is what makes this an unpacker: the remaining
 * expression only *returns* the generated source instead of executing it.
 */
function decode() {
  const field = document.getElementById('code');

  // Tolerate leading whitespace/newlines from a paste; if the prefix were left
  // in place the unpacked script would be executed rather than returned.
  const unwrapped = field.value.replace(/^\s*eval/, '');

  // SECURITY: eval runs whatever was pasted into the field with the page's
  // privileges. Only use this on input you are prepared to execute.
  field.value = eval(unwrapped);
}
坑爹啊有木有,就一行就搞定了,分析了一下才明白,eval函数会运行js代码,将加密后的js代码再还原回来,可是这个eval函数是js的呀,php倒也有类似的eval函数,但它执行的是php代码而不是js,所以不起作用,而我需要在服务器端执行。于是又想通过php还原解密步骤,小心脏受不了最终放弃了。不过倒是发现一个笨方法。在洒家的观察下发现,解密过程实际上只是动态生成了一下js代码,参数是从下面这段区域获取的:
0|0x3|0x3f|0xc0|0xf|0xf0|0xff|1|111111|120000|2|**3**|4|6|ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789|HXXTTKKLLPPP5|KTKY2RBD9NHPBCIHV9ZMEQQDARSLVFDU|QWERTASDFGXYSF|**RANDOMSTR23640**|**WZWS_CONFIRM_PREFIX_LABEL1**|/|**STRRANDOM23640**|body|break|c1|c2|c3|charAt|charCodeAt|clientHeight|clientWidth|confirm|cookie|cookieString|document|documentElement|dynamicurl|else|encoderchars|false|findDimensions|for|function|h|hash|height|i|if|innerHeight|innerWidth|len|length|location|out|path|return|screen|screenX|screenY|str|template|tmp|toString|true|var|w|while|width|window|wzwschallenge|wzwschallengex|wzwstemplate|x|y'.split('|'
用‘|’分割,每一项或者是函数名或者是参数值,反正是由这些组成解密后的代码。这就好办了,通过字符串匹配很快便找到了我需要的参数。加星的是有用的,需要将这些参数传给php中对应的函数。匹配规则就不贴出来了,这样通过自己模仿的函数便可以得出下次访问需要携带的cookie:
wzwstemplate=Mg==;
wzwschallenge=V1pXU19DT05GSVJNX1BSRUZJWF9MQUJFTDIxMjA3MzE=;
wzwsconfirm=11bdf0ec3c162bd5bd6b2f6974a04413;
终于,携带着cookie我们成功进行了第二次访问,第二次访问就好说多了,直接根据返回头中的Set-Cookie更新一下cookie,再跟随302重定向,我们便可以获得网站内容了。
总结一下:要分析类似的网站,首先要会用浏览器的开发者工具,记得清除该域下的cookie和缓存,这样能更好地帮助你分析。
其次要学会分析js代码,静下心来慢慢分析,就会发现并没有想象中的那么复杂。
当然这是比较笨的方法,大牛还是会开心地安装一个js引擎,这样至少不用每次都去翻译js代码了。洒家也正在学习,等洒家安装成功了,再写一篇博客,记录一下。