使用query词抓取百度图片数据的时候,为了简化抓取,使用firebug跟踪到百度图片请求返回了一个json格式数据,例如http://image.baidu.com/i?tn=resultjson_com&ie=gbk&word=%B7%B6%B1%F9%B1%F9&cg=girl&pn=0&rn=60(该url已简化)。我们只需抓取这个json数据即可,但是这个json数据中的objectUrl和fromUrl却是加密了的数据,如ippr_z2C$qAzdH3FAzdH3Fjgp_z&e3Bvg6_z&e3BvgAzdH3FstfpAzdH3Fda8candnAzdH3FWada8candnn98c9lndadl9_z&e3B3r2。后又使用firebug发现,鼠标点击链接时并没有向百度发送解密请求,故判断解密程序在javascript中。
本来打算使用SpiderMonkey执行js来解密,但今天在网上偶然看到一段js解密url的代码,来自百度知道http://zhidao.baidu.com/link?url=APKpO_ktmLa4QE0ZYlOqKyNdLul4rkDbV-LXBeDv40sHe0yDXS-LGc4A3ArNL2RoSY4xD2Z8M_5vdmjm2nAOILhYWXE1ErtGMiPbi-paK4e,测试了下是对的,到这里这个问题总算搞定了。
解密方法很简单,密钥是一张字符对应关系表,有2种映射:(1)多个字符映射为一个字符,'_z2C$q'=>':','_z&e3B'=>'.','AzdH3F'=>'/',(2)单个字符映射为单个字符。根据这个关系可将密文解密为明文,实现并不困难。
C语言的实现:
#include <stdio.h>
#include <string.h>
// Decode an obfuscated URL string into plain text.
//
// Two kinds of substitution are applied while scanning src left to right:
//   1. The 6-character tokens "_z2C$q", "_z&e3B" and "AzdH3F" become
//      ':', '.' and '/' respectively.
//   2. Any other character is looked up in table; a non-zero entry replaces
//      the character, a zero entry leaves it unchanged.
// A '_' or 'A' that does not start one of the tokens is copied verbatim
// without consulting table.
//
// src   - NUL-terminated encoded string.
// table - substitution table indexed by character code. It is assumed to
//         hold at least 128 entries (the caller in this file passes exactly
//         128); bytes outside 0..127 are never looked up.
// dest  - output buffer; the caller must make it large enough. The decoded
//         text is never longer than src, so strlen(src) + 1 bytes suffice.
//
// Returns the length of the decoded string, not counting the terminator.
int Decode(const char *src, const char *table, char *dest) {
    enum { TOKEN_LEN = 6, TABLE_SIZE = 128 };
    char *start = dest;
    const char *p = src;

    while (*p != '\0') {
        if (*p == '_') {
            if (strncmp(p, "_z2C$q", TOKEN_LEN) == 0) {
                *dest++ = ':';
                p += TOKEN_LEN;
            } else if (strncmp(p, "_z&e3B", TOKEN_LEN) == 0) {
                *dest++ = '.';
                p += TOKEN_LEN;
            } else {
                *dest++ = *p++;
            }
        } else if (*p == 'A') {
            if (strncmp(p, "AzdH3F", TOKEN_LEN) == 0) {
                *dest++ = '/';
                p += TOKEN_LEN;
            } else {
                *dest++ = *p++;
            }
        } else {
            // Index through unsigned char: plain char may be signed, and a
            // non-ASCII byte would otherwise become a negative (or too large)
            // index and read outside the table.
            unsigned char c = (unsigned char)*p;
            char mapped = (c < TABLE_SIZE) ? table[c] : 0;
            *dest++ = (mapped != 0) ? mapped : *p;
            p++;
        }
    }
    *dest = '\0';
    return (int)(dest - start);
}
// Demo driver: decodes one hard-coded sample URL with Decode() and prints
// the result followed by a newline.
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    // Single-character substitution table, indexed by the encoded character.
    // Entries not listed stay 0, which Decode() treats as "copy unchanged".
    static const char table[128] = {
        // encoded -> letters
        ['w'] = 'a', ['k'] = 'b', ['v'] = 'c', ['1'] = 'd', ['j'] = 'e',
        ['u'] = 'f', ['2'] = 'g', ['i'] = 'h', ['t'] = 'i', ['3'] = 'j',
        ['h'] = 'k', ['s'] = 'l', ['4'] = 'm', ['g'] = 'n', ['5'] = 'o',
        ['r'] = 'p', ['q'] = 'q', ['6'] = 'r', ['f'] = 's', ['p'] = 't',
        ['7'] = 'u', ['e'] = 'v', ['o'] = 'w',
        // encoded -> digits
        ['8'] = '1', ['d'] = '2', ['n'] = '3', ['9'] = '4', ['c'] = '5',
        ['m'] = '6', ['0'] = '7', ['b'] = '8', ['l'] = '9', ['a'] = '0',
    };

    const char *url = "ippr_z2C$qAzdH3FAzdH3Ft428_z&e3Bd0_z&e3BvgAzdH3Ft4w2jfAzdH3Fda8a8aAzdH3FddAzdH3F8db00d0dnd_9amln8aa_z&e3B3r2";
    char dest[1024];

    Decode(url, table, dest);
    printf("%s\n", dest);
    return 0;
}
自己写的,不能保证效率。
输出:http://img1.27.cn/images/201010/22/1287727232_40693100.jpg
JavaScript的实现:
<script type="text/javascript">
// Substitution table: encoded token -> plain character.
var f = {
    // multi-character tokens
    "_z2C$q": ":", "_z&e3B": ".", "AzdH3F": "/",
    // single characters that decode to letters
    w: "a", k: "b", v: "c", 1: "d", j: "e", u: "f", 2: "g", i: "h",
    t: "i", 3: "j", h: "k", s: "l", 4: "m", g: "n", 5: "o", r: "p",
    q: "q", 6: "r", f: "s", p: "t", 7: "u", e: "v", o: "w",
    // single characters that decode to digits
    8: "1", d: "2", n: "3", 9: "4", c: "5", m: "6", 0: "7", b: "8",
    l: "9", a: "0"
};
var url = "ippr_z2C$qAzdH3FAzdH3Fjgp_z&e3Bvg6_z&e3BvgAzdH3FstfpAzdH3Fda8candnAzdH3FWada8candnn98c9lndadl9_z&e3B3r2";
// Pass 1 pattern: the three multi-character tokens.
var h = /(_z2C\$q|_z&e3B|AzdH3F)/g;
// Pass 2 pattern: every remaining single character in a-w or 0-9.
var s = /([a-w\d])/g;
// Shared replace callback: look the matched text up in the table.
function decodeToken(matched) {
    return f[matched];
}
// Tokens must be replaced first; otherwise their letters and digits would be
// consumed by the single-character pass.
var e = url.replace(h, decodeToken).replace(s, decodeToken);
document.write(e);
</script>
输出:http://ent.cnr.cn/list/20150323/W020150323341549320294.jpg
PHP的实现:
<?PHP
// Substitution table: encoded token => plain character. The last three
// entries are the multi-character tokens for ':', '.' and '/'.
$f = array('w'=>'a','k'=>'b','v'=>'c','1'=>'d','j'=>'e','u'=>'f','2'=>'g','i'=>'h','t'=>'i','3'=>'j','h'=>'k','s'=>'l','4'=>'m','g'=>'n','5'=>'o','r'=>'p','q'=>'q','6'=>'r','f'=>'s','p'=>'t','7'=>'u','e'=>'v','o'=>'w','8'=>'1','d'=>'2','n'=>'3','9'=>'4','c'=>'5','m'=>'6','0'=>'7','b'=>'8','l'=>'9','a'=>'0','_z2C$q'=>':','_z&e3B'=>'.','AzdH3F'=>'/');
$url = 'ippr_z2C$qAzdH3FAzdH3Ft4f_z&e3Bw6ptg2nmc_z&e3Bv54AzdH3Fg51jAzdH3F8da90bAzdH3F'; // single quotes are required: in a double-quoted string "$q" would be interpolated as a variable
// Pass 1: replace the multi-character tokens. The table is captured with
// use ($f) rather than `global $f`, so this also works when the snippet runs
// inside a function or method, where $f is not a global variable.
$url = preg_replace_callback('/_z2C\$q|_z&e3B|AzdH3F/', function ($matches) use ($f) { return $f[$matches[0]]; }, $url);
// Pass 2: replace every remaining single character in a-w or 0-9.
$url = preg_replace_callback('/[a-w\d]/', function ($matches) use ($f) { return $f[$matches[0]]; }, $url);
echo $url;
?>