curl是利用URL语法在命令行方式下工作的开源文件传输工具
本文在php中实现了的一个curl批处理的实例。
代码如下:
header("Content-Type:text/html;charset=utf8"); /* 先获取两个页面的所有a标签 */
// 初始化两个简单处理句柄
$ch1 = curl_init();
$ch2 = curl_init();
curl_setopt_array($ch1,array(
CURLOPT_URL => 'http://www.sina.com.cn',
CURLOPT_HEADER => 0,
CURLOPT_RETURNTRANSFER => 1,
));
curl_setopt_array($ch2,array(
CURLOPT_URL => 'http://www.baidu.com/',
CURLOPT_HEADER => 0,
CURLOPT_RETURNTRANSFER => 1,
)); // 初始化批处理句柄,并添加简单处理句柄
$mh = curl_multi_init();
curl_multi_add_handle($mh,$ch1);
curl_multi_add_handle($mh,$ch2); // 初始化执行状态
$state = null; // 执行批处理
do{
$mc = curl_multi_exec($mh,$state);
}while($mc == CURLM_CALL_MULTI_PERFORM);
while($mc == CURLM_OK && $state) {
while (curl_multi_exec($mh, $state) === CURLM_CALL_MULTI_PERFORM);
// 经过实验,发现curl_multi_select($mh)总是返回-1,意味着一下代码不会执行
if(curl_multi_select($mh) != -1) {
do{
$mc = curl_multi_exec($mh,$state);
}while($mc == CURLM_CALL_MULTI_PERFORM);
}
} // 获取内容
$text = curl_multi_getcontent($ch1);
$text .= curl_multi_getcontent($ch2); // 找到页面中所有的a标签,保存到$matches
$matches = null;
preg_match_all("/<a.*?href\s*?=\s*?[\'\"](.*?)[\'\"].*?>(.*?)<\/a>/",$text,$matches); // 关闭各个句柄
curl_multi_remove_handle($mh,$ch1);
curl_multi_remove_handle($mh,$ch2);
curl_multi_close($mh); /*在找到的连接中继续查找title标签 */ $handle = array(); // 存储简单处理句柄的数组
$mhandle = curl_multi_init(); //批处理句柄
// 处理100个页面
foreach(array_slice($matches[1],0,100) as $href) {
$tmp_h = curl_init();
curl_setopt_array($tmp_h,array(
CURLOPT_URL => $href,
CURLOPT_HEADER => 0,
CURLOPT_RETURNTRANSFER => 1,
));
curl_multi_add_handle($mhandle,$tmp_h);
$handle[] = $tmp_h;
}
do{
$mrc = curl_multi_exec($mhandle,$active);
}while($mrc == CURLM_CALL_MULTI_PERFORM);
while($mrc == CURLM_OK && $active) {
while(curl_multi_exec($mhandle,$active) == CURLM_CALL_MULTI_PERFORM);
if(curl_multi_select($mhandle) != -1) {
do{
$mrc = curl_multi_exec($mhandle,$active);
}while($mrc == CURLM_CALL_MULTI_PERFORM);
}
} // 获取这些页面的内容
$mtext = null;
foreach($handle as $tmp_h) {
$mtext .= curl_multi_getcontent($tmp_h);
curl_multi_remove_handle($mhandle, $tmp_h);
}
$mmatches = array();
preg_match_all("/<title>(.*?)<\/title>/",$mtext, $mmatches); // 编码转换
mb_detect_order('GB2312,GBK,BIG5,GB18030,UNICODE ,CP936');
foreach($mmatches[1] as $key => $val) {
$encoding = mb_detect_encoding($val);
if($encoding != 'UTF-8' && $encoding != 'CP936' && $encoding != 'GB18030' && $encoding !='') {
$mmatches[1][$key] = iconv($encoding,'UTF-8//IGNORE',$val);
}
} // 打印title信息
var_dump($mmatches[1]); // 关闭批处理句柄
curl_multi_close($mhandle);