<?php
// Usage: php vk.php config_path [offset] [limit]
// Example contents of the config_path file (one club URL per line):
/*
https://vk.com/realmadrid_news
https://vk.com/i.madrid.real
https://vk.com/news_madridista
https://vk.com/live.cristiano
https://vk.com/live_madrid
https://vk.com/realmadrid_rf
https://vk.com/i_madridista_fans
https://vk.com/rm_the_best_club
https://vk.com/realmadridkz
*/
// Scrapes the member ids of every VK club listed in the file passed as the
// first CLI argument and writes them to "<club-name>.txt" (the URL with the
// "https://vk.com/" prefix stripped), one id per line.
if (!isset($argv[1])) {
    echo 'required to special file path that include the urls of club to scrap';
    return;
}

// Optional window over the URL list: process $limit lines starting at the
// 0-based line $offset. A limit of 0 means "up to the end of the file".
$offset = isset($argv[2]) ? (int) $argv[2] : 0;
$limit = isset($argv[3]) ? (int) $argv[3] : 0;

$lines = file($argv[1]);
// This check has to come before count(): count(false) throws a TypeError on
// PHP 8, which would hide the intended error message.
if ($lines === false) {
    echo 'file does not exist';
    return;
}

// Exclusive upper bound of the line numbers to process.
$end = $limit === 0 ? count($lines) : $offset + $limit;

foreach ($lines as $line_num => $url) {
    if ($line_num < $offset || $line_num >= $end) {
        continue;
    }

    $url = trim($url);
    if ($url === '') {
        // A blank line (e.g. a trailing newline in the config) would make
        // cURL fail and abort the whole run, so skip it instead.
        continue;
    }

    $file_name = trim(str_replace("https://vk.com/", '', $url)) . '.txt';
    echo "start {$url} $file_name\n";

    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    $header = array('user-agent:Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36');
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
    // NOTE(review): TLS peer/host verification is disabled while a session
    // cookie is being sent; consider enabling verification and moving the
    // cookie out of the source file.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);
    curl_setopt($curl, CURLOPT_COOKIE, 'remixstid=733344069_79104f06be6d06c989; remixlhk=afd63ce76580029ac7; remixdt=18000; remixtst=8e8fdb0a; remixsid=c213ff63618ee65ea76fac08c73a725d8d4b02f06dbaf8b7918af; remixlang=18; remixflash=29.0.0; remixscreen_depth=24; tmr_detect=0%7C1521453754938; remixseenads=0');

    $content = curl_exec($curl);
    if (curl_errno($curl)) {
        // A failure to load the club page stops the whole run.
        echo 'connect error ' . curl_error($curl) . "\n";
        curl_close($curl);
        return;
    }

    // The club page links to its member search; that link carries the numeric
    // group id and the displayed member count.
    if (!preg_match('/<a href="\/search\?c\[section\]=people&c\[group\]=(\d+)".+<span class="header_count fl_l">(.+)<\/span>.+<\/a>/msU', $content, $header_match)) {
        echo "it's failed to get club_id and members_count\n";
        curl_close($curl);
        continue;
    }

    $club_id = $header_match[1];
    echo "club_id {$club_id} ";
    // The count is displayed with comma thousands separators.
    $members_count = (int) str_replace(',', '', $header_match[2]);
    echo "members_count {$members_count} \n";

    // Fetch the member list page by page to collect the VK user ids.
    curl_setopt($curl, CURLOPT_URL, 'https://vk.com/al_page.php');
    curl_setopt($curl, CURLOPT_POST, true);

    // The first successful write truncates the output file, later writes
    // append, so ids left over from an earlier run are never mixed in.
    $write_flags = 0;

    // The offset advances by 60 per request (presumably the page size of the
    // members box -- confirm against the endpoint).
    for ($page_offset = 0; $page_offset < $members_count; $page_offset += 60) {
        $post_data = array(
            'act' => 'box',
            'al' => 1,
            'al_ad' => 0,
            'offset' => $page_offset == 0 ? null : $page_offset,
            'oid' => '-' . $club_id,
            'tab' => 'members',
        );
        curl_setopt($curl, CURLOPT_POSTFIELDS, $post_data);

        $content = curl_exec($curl);
        if ($content === false) {
            // Report the failed page instead of silently dropping it, then
            // carry on with the next page so partial results are still saved.
            echo 'connect error ' . curl_error($curl) . "\n";
            continue;
        }

        if (!preg_match_all('/data-id="(\d+)"/', $content, $id_matches)) {
            continue;
        }

        // One write per page instead of one per id; the bytes are the same.
        $page_ids = implode("\r\n", $id_matches[1]) . "\r\n";
        if (file_put_contents($file_name, $page_ids, $write_flags) === false) {
            echo "failed to write {$file_name}\n";
            continue;
        }
        $write_flags = FILE_APPEND;
    }

    curl_close($curl);
}