LLM Applications in Practice: Automated Aggregation of AI News and Report Generation
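The script below implements the news-aggregation side of this workflow: it crawls the AIBase news site with crawl4ai, parses each article page with BeautifulSoup, de-duplicates against a local JSON history, and returns the previous day's items for downstream report generation.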
import datetime
import json
import os
import re
from typing import Any, Dict, List, Union

import requests
from bs4 import BeautifulSoup
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

from file_util import *
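The helpers pulled in from file_util (get_datas, save_datas, md5) are not shown in the article. As a minimal sketch, assuming get_datas/save_datas round-trip a list of dicts through a JSON file and md5 produces a stable hash of a URL string, they might look like the hypothetical module below (an assumption, not the author's actual code):

# file_util.py -- hypothetical sketch; the real module is not included in the article
import hashlib
import json
import os
from typing import Dict, List


def get_datas(file_path: str) -> List[Dict]:
    # Read a JSON file containing a list of dicts
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def save_datas(file_path: str, datas: List[Dict]) -> None:
    # Write the list of dicts back to disk, creating the parent directory if needed
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(datas, f, ensure_ascii=False, indent=2)


def md5(text: str) -> str:
    # Stable id derived from a URL
    return hashlib.md5(text.encode('utf-8')).hexdigest()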
class AbstractAICrawler():
    # Base interface: concrete crawlers must implement crawl()

    def __init__(self) -> None:
        pass

    def crawl(self):
        raise NotImplementedError()
class AINewsCrawler(AbstractAICrawler):

    def __init__(self, domain) -> None:
        super().__init__()
        self.domain = domain
        self.file_path = f'data/{self.domain}.json'
        self.history = self.init()

    def init(self):
        # Load previously crawled items keyed by id, so reruns skip known links
        if not os.path.exists(self.file_path):
            return {}
        return {ele['id']: ele for ele in get_datas(self.file_path)}

    def save(self, datas: Union[List, Dict]):
        # Merge new items into the history and persist everything to disk
        if isinstance(datas, dict):
            datas = [datas]
        self.history.update({ele['id']: ele for ele in datas})
        save_datas(self.file_path, datas=list(self.history.values()))
    async def crawl(self, url: str, schema: Dict[str, Any] = None):
        # With a schema, crawl4ai extracts structured JSON via CSS selectors;
        # without one, return the cleaned HTML for manual parsing.
        extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) if schema else None
        async with AsyncWebCrawler(verbose=True) as crawler:
            result = await crawler.arun(
                url=url,
                extraction_strategy=extraction_strategy,
                bypass_cache=True,
            )
            assert result.success, "Failed to crawl the page"
            if schema:
                return json.loads(result.extracted_content)
            return result.cleaned_html
class AIBasesCrawler(AINewsCrawler):

    def __init__(self) -> None:
        self.domain = 'aibase'
        super().__init__(self.domain)
        self.url = 'https://www.aibase.com'

    async def crawl_home(self, url='https://www.aibase.com/news'):
        # Collect article links from the English news list page
        schema = {
            'name': 'ai base home page crawler',
            'baseSelector': '.flex',
            'fields': [
                {
                    'name': 'link',
                    'selector': 'a[rel="noopener noreferrer"]',
                    'type': 'nested_list',
                    'fields': [
                        {'name': 'href', 'type': 'attribute', 'attribute': 'href'}
                    ]
                }
            ]
        }
        links = await super().crawl(url, schema)
        links = [link['href'] for ele in links for link in ele['link']]
        links = list(set([f'{self.url}{ele}' for ele in links if ele.startswith('/news')]))
        links = sorted(links, key=lambda x: x, reverse=True)
        return links
    async def crawl_newsletter_cn(self, url):
        # Parse a Chinese article page: title, publish date and body text
        html = await super().crawl(url)
        body = BeautifulSoup(html, 'html.parser')
        title = body.select_one('h1').get_text().replace('\u200b', '').strip()
        date = [ele.get_text().strip() for ele in body.find_all('span')
                if re.match(r'(\d{4}年\d{1,2}月\d{1,2}号)', ele.get_text().strip())][0]
        date = datetime.datetime.strptime(date, '%Y年%m月%d号 %H:%M').strftime("%Y-%m-%d")
        content = '\n'.join([ele.get_text().strip().replace('\n', '').replace(' ', '') for ele in body.find_all('p')])
        # Drop the trailing "划重点:" (key points) summary block if present
        content = content[:content.index('划重点:')].strip() if '划重点:' in content else content
        return {
            'title': title,
            'link': url,
            'content': content,
            'date': date
        }
    async def crawl_home_cn(self, url='https://www.aibase.com/zh/news'):
        # Same as crawl_home, but for the Chinese news list page
        schema = {
            'name': 'ai base home page crawler',
            'baseSelector': '.flex',
            'fields': [
                {
                    'name': 'link',
                    'selector': 'a[rel="noopener noreferrer"]',
                    'type': 'nested_list',
                    'fields': [
                        {'name': 'href', 'type': 'attribute', 'attribute': 'href'}
                    ]
                }
            ]
        }
        links = await super().crawl(url, schema)
        links = [link['href'] for ele in links for link in ele['link']]
        links = list(set([f'{self.url}{ele}' for ele in links if ele.startswith('/zh/news')]))
        links = sorted(links, key=lambda x: x, reverse=True)
        return links
    async def crawl_newsletter(self, url):
        # Parse an English article page: title, date, body text and cover image
        html = await super().crawl(url)
        body = BeautifulSoup(html, 'html.parser')
        title = body.select_one('h1').get_text().replace('\u200b', '').strip()
        date = ';'.join([ele.get_text().strip() for ele in body.find_all('span')])
        date = re.findall(r'(\b\w{3}\s+\d{1,2},\s+\d{4}\b)', date)[0]
        date = datetime.datetime.strptime(date, '%b %d, %Y').strftime("%Y-%m-%d")
        content = '\n'.join([ele.get_text().strip().replace('\n', '') for ele in body.find_all('p')])
        # Drop the trailing "Key Points:" summary block if present
        content = content[:content.index('Key Points:')].strip() if 'Key Points:' in content else content
        # Download the first captioned image as the cover picture
        pic_urls = [ele.get('src').strip() for ele in body.select('img') if ele.get('title')]
        pic_url = pic_urls[0] if pic_urls else ''
        pic_url = pic_url.replace('\\"', '')
        pic_path = ''
        if pic_url:
            os.makedirs('data/images', exist_ok=True)
            pic_path = f'data/images/{md5(url)}.jpg'
            response = requests.get(pic_url)
            if response.status_code == 200:
                with open(pic_path, 'wb') as f:
                    f.write(response.content)
        return {
            'title': title,
            'link': url,
            'content': content,
            'date': date,
            'pic': pic_path,
            'id': md5(url)
        }
    async def crawl(self):
        # Crawl links not seen before, persist them, then return yesterday's items
        links = await self.crawl_home()
        results = []
        for link in links:
            _id = md5(link)
            if _id in self.history:
                continue
            results.append({
                'id': _id,
                'link': link,
                'contents': await self.crawl_newsletter(link),
                'time': datetime.datetime.now().strftime('%Y-%m-%d')
            })
        self.save(results)
        return await self.get_last_day_data()
    async def get_last_day_data(self):
        # Reload the history from disk and keep only articles published yesterday
        last_day = (datetime.date.today() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
        datas = self.init()
        for v in datas.values():
            v['contents']['id'] = v['id']
        return [v['contents'] for v in datas.values() if v['contents']['date'] == last_day]
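A minimal way to drive the crawler is to append an entry point like the one below to the same script; this driver is illustrative and not part of the original article:

import asyncio

if __name__ == '__main__':
    crawler = AIBasesCrawler()
    # Fetch new articles, update data/aibase.json, and print yesterday's items
    articles = asyncio.run(crawler.crawl())
    for article in articles:
        print(article['date'], article['title'], article['link'])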