async for 在爬虫中的使用例子

2024-02-01 09:04:10
import asyncio

import re

import typing

from concurrent.futures import Executor, ThreadPoolExecutor

from urllib.request import urlopen

DEFAULT_EXECUTOR = ThreadPoolExecutor(4)
ANCHOR_TAG_PATTERN = re.compile(b"<a.+?href=[\"|\'](.*?)[\"|\'].*?>", re.RegexFlag.MULTILINE | re.RegexFlag.IGNORECASE)


async def wrap_async(generator: typing.Generator,
                     executor: Executor = DEFAULT_EXECUTOR,
                     sentinel=None,
                     *,
                     loop: asyncio.AbstractEventLoop = None):
    """
    We wrap a generator and return an asynchronous generator instead
    :param iterator:
    :param executor:
    :param sentinel:
    :param loop:
    :return:
    """

    if not loop:
        loop = asyncio.get_running_loop()

    while True:
        # 相当于执行next(generator)
        result = await loop.run_in_executor(executor, next, generator, sentinel)
        if result == sentinel:
            # 如果链接为空跳出
            break
        yield result


def follow(*links):
    """
    :param links:
    :return:
    """

    return ((link, urlopen(link).read()) for link in links)


def get_links(text: str):
    """
    Get back an iterator that gets us all the links in a text iteratively and safely
    :param text:
    :return:
    """

    # Always grab the last match, because that is how a smart http parser would interpret a malformed
    # anchor tag
    return (match.groups()[-1]
            for match in ANCHOR_TAG_PATTERN.finditer(text)
            # This portion is a safeguard against None matches and zero href matches
            if hasattr(match, "groups") and len(match.groups()))


async def main(*links):
    async for current, body in wrap_async(follow(*links)):
        print("Current url:", current)
        print("Content:", body)
        async for link in wrap_async(get_links(body)):
            print(link)


asyncio.run(main("https://www.cnblogs.com/c-x-a"))
码农公寓

相关文章