LLM应用实战: AI资讯的自动聚合及报告生成

from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
import json
from typing import Dict, Any, Union, List, Optional
from bs4 import BeautifulSoup
# NOTE: the original import order is preserved. Names imported after this
# star import (os, datetime, re, requests) override any same-named names it
# exports. get_datas, save_datas and md5 are not defined in this file;
# presumably they come from file_util -- confirm.
from file_util import *
import os
import datetime
import re
import requests

# Directory that downloaded article pictures are written to.
_IMAGE_DIR = 'data/images'
# Upper bound (seconds) on a picture download so a stalled server cannot
# hang the crawl forever.
_IMAGE_TIMEOUT_SECONDS = 30
# Date as rendered on the Chinese article pages, e.g. "2024年11月5号".
_CN_DATE_RE = re.compile(r'(\d{4}年\d{1,2}月\d{1,2}号)')
# Date as rendered on the English article pages, e.g. "Nov 5, 2024".
_EN_DATE_RE = re.compile(r'(\b\w{3}\s+\d{1,2},\s+\d{4}\b)')


def _truncate_at(text: str, marker: str) -> str:
    """Return the stripped text before ``marker``, or ``text`` unchanged if absent."""
    head, found, _ = text.partition(marker)
    return head.strip() if found else text


class AbstractAICrawler():
    """Base interface for crawlers; subclasses must override ``crawl``."""

    def __init__(self) -> None:
        pass

    def crawl(self, *args, **kwargs):
        """Raise ``NotImplementedError``; concrete crawlers provide the logic.

        ``self`` is accepted so that calling this on an instance raises
        ``NotImplementedError`` rather than a ``TypeError`` about arguments.
        """
        raise NotImplementedError()


class AINewsCrawler(AbstractAICrawler):
    """Crawler with a JSON-file-backed history of records keyed by ``id``."""

    def __init__(self, domain) -> None:
        """Remember ``domain`` and load the history from ``data/<domain>.json``."""
        super().__init__()
        self.domain = domain
        self.file_path = f'data/{self.domain}.json'
        self.history = self.init()

    def init(self):
        """Read the history file and return its records as ``{id: record}``.

        Returns an empty dict when the history file does not exist yet.
        """
        if not os.path.exists(self.file_path):
            return {}
        return {ele['id']: ele for ele in get_datas(self.file_path)}

    def save(self, datas: Union[List, Dict]):
        """Merge one record or a list of records into the history and persist it.

        Records with an ``id`` already in the history replace the old entry.
        """
        if isinstance(datas, dict):
            datas = [datas]
        self.history.update({ele['id']: ele for ele in datas})
        # save_datas is not visible here; creating the parent directory first
        # is harmless if it already does so.
        os.makedirs(os.path.dirname(self.file_path) or '.', exist_ok=True)
        save_datas(self.file_path, datas=list(self.history.values()))

    async def crawl(self, url: str, schema: Optional[Dict[str, Any]] = None):
        """Fetch ``url`` and return extracted data or cleaned HTML.

        Args:
            url: Page to fetch.
            schema: Optional schema passed to ``JsonCssExtractionStrategy``.

        Returns:
            The JSON-decoded extracted content when ``schema`` is given,
            otherwise the page's ``cleaned_html``.

        Raises:
            AssertionError: If the crawl result reports failure.
        """
        extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) if schema else None
        async with AsyncWebCrawler(verbose=True) as crawler:
            result = await crawler.arun(
                url=url,
                extraction_strategy=extraction_strategy,
                bypass_cache=True,
            )
            # Raised explicitly (same type and message as the former assert)
            # so the check is not stripped when Python runs with -O.
            if not result.success:
                raise AssertionError("Failed to crawl the page")
            if schema:
                return json.loads(result.extracted_content)
            return result.cleaned_html


class AIBasesCrawler(AINewsCrawler):
    """Crawler for the news pages of https://www.aibase.com."""

    def __init__(self) -> None:
        """Initialise with the ``aibase`` domain and the site base URL."""
        super().__init__('aibase')
        self.url = 'https://www.aibase.com'

    async def _crawl_links(self, url: str, prefix: str) -> List[str]:
        """Return the absolute article links on ``url`` whose href starts with ``prefix``.

        Links are de-duplicated and sorted in descending string order.
        """
        schema = {
            'name': 'ai base home page crawler',
            'baseSelector': '.flex',
            'fields': [
                {
                    'name': 'link',
                    'selector': 'a[rel="noopener noreferrer"]',
                    'type': 'nested_list',
                    'fields': [
                        {'name': 'href', 'type': 'attribute', 'attribute': 'href'},
                    ],
                },
            ],
        }
        rows = await super().crawl(url, schema)
        # Tolerate rows without a 'link' list and anchors without an href.
        hrefs = {
            link.get('href')
            for row in rows
            for link in (row.get('link') or [])
        }
        return sorted(
            (f'{self.url}{href}' for href in hrefs if href and href.startswith(prefix)),
            reverse=True,
        )

    def _download_image(self, pic_url: str, url: str) -> str:
        """Download ``pic_url`` to the image directory and return the local path.

        The file name is derived from ``md5(url)``. Returns ``''`` when the
        download fails, so callers never receive a path to a missing file.
        """
        pic_path = f'{_IMAGE_DIR}/{md5(url)}.jpg'
        try:
            response = requests.get(pic_url, timeout=_IMAGE_TIMEOUT_SECONDS)
        except requests.RequestException:
            # The picture is optional; a network error is treated the same
            # way as a non-200 response.
            return ''
        if response.status_code != 200:
            return ''
        os.makedirs(_IMAGE_DIR, exist_ok=True)
        with open(pic_path, 'wb') as f:
            f.write(response.content)
        return pic_path

    async def crawl_home(self, url='https://www.aibase.com/news'):
        """Return the English article links (hrefs starting with ``/news``) found on ``url``."""
        return await self._crawl_links(url, '/news')

    async def crawl_newsletter_cn(self, url):
        """Crawl one Chinese article page.

        Returns:
            A dict with ``title``, ``link``, ``content`` and ``date``
            (formatted ``YYYY-MM-DD``).

        Raises:
            IndexError: If no ``<span>`` on the page starts with a date.
        """
        html = await super().crawl(url)
        body = BeautifulSoup(html, 'html.parser')
        title = body.select_one('h1').get_text().replace('\u200b', '').strip()

        date = None
        for span in body.find_all('span'):
            matched = _CN_DATE_RE.match(span.get_text().strip())
            if matched:
                date = matched.group(1)
                break
        if date is None:
            raise IndexError(f'no publication date found on {url}')
        # Parse only the matched date part: the span may or may not carry a
        # trailing time, and only the day is kept in the result anyway.
        date = datetime.datetime.strptime(date, '%Y年%m月%d号').strftime("%Y-%m-%d")

        content = '\n'.join(
            ele.get_text().strip().replace('\n', '').replace(' ', '')
            for ele in body.find_all('p')
        )
        # Everything from the '划重点:' marker onwards is dropped.
        content = _truncate_at(content, '划重点:')
        return {
            'title': title,
            'link': url,
            'content': content,
            'date': date
        }

    async def crawl_home_cn(self, url='https://www.aibase.com/zh/news'):
        """Return the Chinese article links (hrefs starting with ``/zh/news``) found on ``url``."""
        return await self._crawl_links(url, '/zh/news')

    async def crawl_newsletter(self, url):
        """Crawl one English article page and download its first titled picture.

        Returns:
            A dict with ``title``, ``link``, ``content``, ``date``
            (formatted ``YYYY-MM-DD``), ``pic`` (local file path, or ``''``
            when there is no picture or the download failed) and ``id``
            (``md5(url)``).

        Raises:
            IndexError: If no date of the form ``Mon D, YYYY`` is found in
                the page's ``<span>`` texts.
        """
        html = await super().crawl(url)
        body = BeautifulSoup(html, 'html.parser')
        title = body.select_one('h1').get_text().replace('\u200b', '').strip()

        span_text = ';'.join(ele.get_text().strip() for ele in body.find_all('span'))
        matched = _EN_DATE_RE.search(span_text)
        if matched is None:
            raise IndexError(f'no publication date found on {url}')
        date = datetime.datetime.strptime(matched.group(1), '%b %d, %Y').strftime("%Y-%m-%d")

        content = '\n'.join(
            ele.get_text().strip().replace('\n', '')
            for ele in body.find_all('p')
        )
        # Everything from the 'Key Points:' marker onwards is dropped.
        content = _truncate_at(content, 'Key Points:')

        # Only images that carry a title attribute are considered; skip any
        # without a src so .strip() is never called on None.
        pic_urls = [
            ele.get('src').strip()
            for ele in body.select('img')
            if ele.get('title') and ele.get('src')
        ]
        pic_url = pic_urls[0] if pic_urls else ''
        pic_url = pic_url.replace('\\"', '')
        pic_path = self._download_image(pic_url, url) if pic_url else ''
        return {
            'title': title,
            'link': url,
            'content': content,
            'date': date,
            'pic': pic_path,
            'id': md5(url)
        }

    async def crawl(self):
        """Crawl every not-yet-seen English article and return yesterday's items.

        Note that this override takes no ``url``/``schema`` arguments, unlike
        ``AINewsCrawler.crawl``.

        Returns:
            The ``contents`` dicts of all stored records dated yesterday.
        """
        links = await self.crawl_home()
        results = []
        try:
            for link in links:
                _id = md5(link)
                if _id in self.history:
                    continue
                results.append({
                    'id': _id,
                    'link': link,
                    'contents': await self.crawl_newsletter(link),
                    'time': datetime.datetime.now().strftime('%Y-%m-%d')
                })
        finally:
            # Persist whatever was collected even if a later page failed, so
            # the next run does not have to re-crawl those articles.
            self.save(results)
        return await self.get_last_day_data()

    async def get_last_day_data(self):
        """Return the ``contents`` of stored records whose ``date`` is yesterday.

        Records are re-read from the history file, and each returned
        ``contents`` dict has its ``id`` set to the record's ``id``.
        """
        last_day = (datetime.date.today() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
        datas = self.init()
        for v in datas.values():
            v['contents']['id'] = v['id']
        return [v['contents'] for v in datas.values() if v['contents']['date'] == last_day]
