PHP多进程爬虫

<?php

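/**
 * Multi-process crawler.
 *
 * Requirements:
 * 1. PHP must have the pcntl extension installed (the script also calls
 *    posix_getpid() and the curl_* functions, so the posix and curl
 *    extensions are needed as well).
 * 2. composer require fabpot/goutte --prefer-dist
 */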
/**
 * Fetch one resource with cURL and store it on disk via saveAsImage().
 *
 * The response body is read fully into memory. Nothing is written when the
 * transfer fails or the server answers with an HTTP error status; a message
 * is echoed instead.
 *
 * @param string $url  Absolute URL of the image to fetch.
 * @param string $path Base directory the image is stored under.
 *
 * @return void
 */
function downloadImage($url, $path = '/www/images/')
{
    $ch = curl_init();
    if ($ch === false) {
        echo 'DOWNLOAD ' . $url . ' FAIL: cannot initialise cURL' . PHP_EOL;
        return;
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30);
    // Treat HTTP status >= 400 as a failure so an error page is never
    // stored as if it were image data.
    curl_setopt($ch, CURLOPT_FAILONERROR, true);

    $file = curl_exec($ch);
    $error = curl_error($ch);
    curl_close($ch);

    // With CURLOPT_RETURNTRANSFER set, curl_exec() returns false on failure;
    // passing that on would create an empty or garbage file.
    if ($file === false) {
        echo 'DOWNLOAD ' . $url . ' FAIL: ' . $error . PHP_EOL;
        return;
    }

    saveAsImage($url, $file, $path);
}

 /**
  * Write downloaded bytes to a file below $path that mirrors the URL's path.
  *
  * For example the URL "http://host/a/b.jpg" with $path "/www/du" is stored
  * as "/www/du/a/b.jpg". Missing directories are created. An existing file is
  * overwritten. On any failure a message is echoed and nothing is written.
  *
  * @param string $url  URL the data was downloaded from; only its path part is used.
  * @param string $file Raw file contents.
  * @param string $path Base directory the URL path is appended to.
  *
  * @return void
  */
 function saveAsImage($url, $file, $path)
 {
     $url_path = parse_url($url, PHP_URL_PATH);

     // A URL without a path, or one ending in "/", names no file.
     if (!is_string($url_path) || $url_path === '' || substr($url_path, -1) === '/') {
         echo 'NO FILE NAME IN URL ' . $url . PHP_EOL;
         return;
     }

     // The URL comes from a remote page: refuse ".." segments so the target
     // cannot end up outside of $path.
     if (in_array('..', explode('/', $url_path), true)) {
         echo 'UNSAFE PATH IN URL ' . $url . PHP_EOL;
         return;
     }

     $filename = $path . $url_path;
     $real_dir = pathinfo($filename, PATHINFO_DIRNAME);

     // The second is_dir() covers another process creating the directory
     // between our check and our mkdir() call.
     if (!is_dir($real_dir) && !mkdir($real_dir, 0777, true) && !is_dir($real_dir)) {
         echo 'MAKE DIR  ' . $real_dir . 'FAIL!!!' . PHP_EOL;
         return;
     }

     // Overwrite instead of append: appending a second download of the same
     // URL to the existing file would corrupt the image.
     if (file_put_contents($filename, $file) === false) {
         echo 'WRITE FILE ' . $filename . ' FAIL!!!' . PHP_EOL;
     }
 }

include __DIR__ . '/vendor/autoload.php';

use Goutte\Client;

$client = new Client();

// Topic listing URLs to crawl. Each ends in "_<page number>"; ".html" is
// appended when the page is requested.
$links = [
    'http://www.nipic.com/topic/show_27192_1',
    'http://www.nipic.com/topic/show_27054_1',
    'http://www.nipic.com/topic/show_27085_1',
];

$pids = [];
$dir = '/www/du';

// Fork one child per topic; every child crawls 10 consecutive listing pages.
foreach ($links as $url) {
    $pid = pcntl_fork();
    switch ($pid) {
        case -1:
            die("Fork failed\n");
        case 0:
            // Child process.
            $id = posix_getpid();
            echo "Create child process $id success~\n";
            $data = [];

            // Split the URL once and remember the starting page number, so
            // each iteration derives its page from the original value
            // instead of from the previous iteration's result.
            $url_arr = explode('_', $url);
            $last = count($url_arr) - 1;
            $first_page = intval($url_arr[$last]);

            for ($i = 0; $i < 10; $i++) {
                $url_arr[$last] = $first_page + $i;
                $page_url = implode('_', $url_arr) . '.html';

                $crawler = $client->request('GET', $page_url);
                // Follow every thumbnail link on the listing page ...
                $crawler->filter('.search-works-thumb')->each(function ($node) use ($client, $id, $dir, &$data) {
                    $detail_url = $node->link()->getUri();

                    $detail = $client->request('GET', $detail_url);
                    // ... and download the image shown on the detail page.
                    $detail->filter('#J_worksImg')->each(function ($node) use ($id, $dir, &$data) {
                        $src = $node->image()->getUri();

                        $data[$id][] = $src;
                        downloadImage($src, $dir);
                    });
                });
            }

            print_r($data);

            // The child must stop here, otherwise it would go on with the
            // parent's foreach and fork children of its own.
            exit;
        default:
            // Parent process: remember the child so it can be reaped below.
            $pids[$pid] = $pid;
            var_dump($pids);
            break;
    }
}

// Reap children until all of them have terminated. No WUNTRACED here: a
// child that is merely stopped is still alive and must stay in $pids.
while (count($pids)) {
    $id = pcntl_wait($status);
    if ($id > 0) {
        echo "child process $id is exit.\n";
        unset($pids[$id]);
        continue;
    }

    // -1 after an interrupting signal is harmless, just wait again. Any
    // other -1 means there is nothing left to wait for; stop instead of
    // spinning forever.
    if ($id === -1 && pcntl_get_last_error() !== PCNTL_EINTR) {
        break;
    }
}

echo "Done\n";

  

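// PHP multi-process crawler
//
// Previous post: the difference between uploading files with rsync and scp
//
// Next post: using switch in react-jsx to render different components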