Scraping the Fortune China 500 list
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

url = "http://www.fortunechina.com/fortune500/c/2020-08/10/content_372148.htm"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}
ret = Request(url, headers=headers)
html = urlopen(ret)
bs = BeautifulSoup(html, "html.parser")
# The ranking table: one <tr> per company inside the page's <tbody>.
tr = bs.find('tbody').find_all('tr')
for j in tr:
    td = j.find_all('td')
    number = td[0].get_text().strip()   # rank
    names = td[1].get_text().strip()    # company name
    income = td[2].get_text().strip()   # revenue
    profit = td[3].get_text().strip()   # profit
    country = td[4].get_text().strip()  # country
    # chr(12288) is the full-width (ideographic) space; using it as the fill
    # character keeps the columns aligned when the values contain CJK text.
    row = '{0:{5}<20}\t{1:{5}<20}\t{2:{5}<20}\t{3:{5}<20}\t{4:{5}<20}'.format(
        number, income, profit, country, names, chr(12288))
    print(row)
[Result screenshot omitted]
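For reuse beyond the console, the rows can go straight into a file. A minimal sketch (not part of the original script) that writes the same table to fortune500.csv, assuming tr was built as above; utf-8-sig adds a BOM so Excel detects the encoding of the Chinese text:

import csv

# Sketch: write the scraped rows to a CSV file instead of printing them.
# Assumes `tr` was obtained exactly as in the script above.
with open('fortune500.csv', 'w', newline='', encoding='utf-8-sig') as fp:
    writer = csv.writer(fp)
    writer.writerow(['rank', 'company', 'revenue', 'profit', 'country'])
    for j in tr:
        td = j.find_all('td')
        writer.writerow([td[k].get_text().strip() for k in range(5)])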
Scraping Douban's Top 250 movies
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}
# Ten pages of 25 movies each.
for i in range(0, 250, 25):
    url = "https://movie.douban.com/top250?start={0}&filter=".format(i)
    ret = Request(url, headers=headers)
    html = urlopen(ret)
    bs = BeautifulSoup(html, "html.parser")
    names = bs.find_all('span', {'class': "title"})
    scores = bs.find_all('span', {'class': "rating_num"})
    numbers = bs.find_all('em', {'class': ""})
    name_list = []
    for name in names:
        name = name.get_text()
        # Each movie also has a second title span of the form
        # "\xa0/\xa0Foreign Title"; keep only the main (Chinese) title.
        if name[1] != '/':
            name_list.append(name)
    for number, name, score in zip(numbers, name_list, scores):
        print(number.get_text(), name, score.get_text())
[Result screenshot omitted]
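The name[1] != '/' test leans on the exact shape of the alternate-title span. An alternative sketch that parses one movie container at a time, so rank, title and score cannot drift out of step; it assumes each movie sits in a div with class "item", which matched Douban's markup at the time of writing:

# Sketch: parse one <div class="item"> per movie instead of three parallel
# find_all lists. Assumes the page was fetched into `bs` as in the loop above.
for item in bs.find_all('div', {'class': 'item'}):
    rank = item.find('em').get_text()
    title = item.find('span', {'class': 'title'}).get_text()  # first title span is the Chinese one
    score = item.find('span', {'class': 'rating_num'}).get_text()
    print(rank, title, score)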
Scraping the Chinese university rankings
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

url = "http://www.shanghairanking.cn/rankings/bcur/2020"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}
ret = Request(url, headers=headers)
html = urlopen(ret)
bs = BeautifulSoup(html, "html.parser")
# Table structure: tbody > tr > td
tr = bs.find('tbody').find_all('tr')
for j in tr:
    td = j.find_all('td')
    number = td[0].get_text().strip()    # rank
    college = td[1].get_text().strip()   # university name
    province = td[2].get_text().strip()  # province
    category = td[3].get_text().strip()  # type ('type' would shadow the built-in)
    score = td[4].get_text().strip()     # total score
    print(number, college, province, category, score)
[Result screenshot omitted]
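The commented-out listall in earlier drafts of these scripts hints at collecting the rows for later use. A small sketch of that idea, accumulating tuples and then filtering by province (the filter value is illustrative); it reuses the tr list from above:

# Sketch: accumulate rows, then filter them, following the listall idea.
rows = []
for j in tr:
    td = j.find_all('td')
    rows.append(tuple(td[k].get_text().strip() for k in range(5)))
picked = [r for r in rows if r[2] == '江苏']  # e.g. keep one province
print(len(picked), 'universities in 江苏')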
Scraping the Chinese university rankings, with detail-page links
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

url = "http://www.shanghairanking.cn/rankings/bcur/2020"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}
ret = Request(url, headers=headers)
html = urlopen(ret)
bs = BeautifulSoup(html, "html.parser")
tr = bs.find('tbody').find_all('tr')
for j in tr:
    td = j.find_all('td')
    number = td[0].get_text().strip()
    college = td[1].get_text().strip()
    province = td[2].get_text().strip()
    category = td[3].get_text().strip()
    score = td[4].get_text().strip()
    # The first <a> in the row holds a site-relative link to the
    # university's detail page; prepend the site root to make it absolute.
    a = j.find_all('a')
    wangzhi = a[0].get('href')
    link = 'http://www.shanghairanking.cn{0}'.format(wangzhi)
    print(number, college, province, category, score, link)
[Result screenshot omitted]
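String concatenation works here because every href on this page is site-relative. For the general case, urllib.parse.urljoin from the standard library resolves relative hrefs against the page URL and leaves absolute ones untouched; the paths below are illustrative, not real links from the page:

from urllib.parse import urljoin

base = "http://www.shanghairanking.cn/rankings/bcur/2020"
print(urljoin(base, '/institution/example'))  # -> http://www.shanghairanking.cn/institution/example
print(urljoin(base, 'https://other.site/x'))  # absolute hrefs pass through unchanged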
Scraping Levitan's landscape oil paintings
import os
from urllib.request import Request, urlopen, urlretrieve
import bs4

url = 'https://www.sohu.com/a/286956359_301394'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
ret = Request(url=url, headers=header)
html = urlopen(ret)
bs = bs4.BeautifulSoup(html, 'html.parser')
# All the painting images sit inside <article class="article">.
pictures = bs.find('article', {'class': "article"}).find_all('img')
save_dir = os.path.abspath('C:/Users/86186/picture')  # 'dir' would shadow the built-in
for i, picture in enumerate(pictures, start=1):
    src = picture.attrs['src']
    work_path = os.path.join(save_dir, 'picture{}.jpeg'.format(i))
    urlretrieve(src, work_path)
[Result screenshot omitted]
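urlretrieve raises if the target folder is missing or one image fails to download. A defensive sketch around the same loop, reusing the pictures list from above:

import os
from urllib.error import URLError
from urllib.request import urlretrieve

save_dir = os.path.abspath('C:/Users/86186/picture')
os.makedirs(save_dir, exist_ok=True)  # create the folder if it does not exist
for i, picture in enumerate(pictures, start=1):
    src = picture.attrs.get('src')
    if not src:
        continue  # some <img> tags lazy-load and carry no src attribute
    try:
        urlretrieve(src, os.path.join(save_dir, 'picture{}.jpeg'.format(i)))
    except URLError:  # also covers HTTPError
        print('skipped', src)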
Scraping 51job (前程无忧) job listings
from urllib.request import Request, urlopen
import bs4
import json
import xlwt

workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('前程无忧招聘信息')
worksheet.write(0, 0, label='工作名称')  # job title
worksheet.write(0, 1, label='公司名称')  # company name
worksheet.write(0, 2, label='工作地区')  # location
worksheet.write(0, 3, label='公司属性')  # company type
worksheet.write(0, 4, label='职位要求')  # position attributes
worksheet.write(0, 5, label='职责要求')  # job description
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
z = 1  # next worksheet row to write
for page in range(1, 6):
    # Search pages 1-5 for the keyword "大数据" (double URL-encoded in the path).
    url = ('https://search.51job.com/list/080200,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,'
           + str(page) + '.html?lang=c&postchannel=0000&workyear=99&cotype=99'
           '&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=')
    ret = Request(url=url, headers=header)
    html = urlopen(ret)
    bs = bs4.BeautifulSoup(html, 'html.parser')
    # The results are embedded as JSON assigned to window.__SEARCH_RESULT__
    # inside one of the page's <script> tags.
    for script in bs.find_all('script', type="text/javascript"):
        t = script.get_text()
        if 'window.__SEARCH_RESULT__' not in t:
            continue
        t = t.replace('window.__SEARCH_RESULT__ = ', '')
        data = json.loads(t)
        for job in data["engine_search_result"]:
            worksheet.write(z, 0, job['job_name'])
            worksheet.write(z, 1, job['company_name'])
            worksheet.write(z, 2, job['workarea_text'])
            worksheet.write(z, 3, job['companytype_text'])
            worksheet.write(z, 4, job['attribute_text'])
            # Fetch the job's detail page for the full description.
            infos = Request(job['job_href'], headers=header)
            bs2 = bs4.BeautifulSoup(urlopen(infos), 'html.parser')
            try:
                texts = bs2.find('div', {"class": 'bmsg job_msg inbox'}).get_text().split()
                worksheet.write(z, 5, label="".join(texts))
            except AttributeError:  # detail page lacks the expected <div>
                worksheet.write(z, 5, label=' ')
            z += 1
workbook.save('前程无忧.xls')
[Result screenshot omitted]
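Stripping the "window.__SEARCH_RESULT__ = " prefix with replace breaks if the script text gains leading whitespace or a trailing semicolon. A regex-based extraction sketch, reusing the bs soup from the loop above:

import json
import re

# Sketch: pull the JSON object out of the inline script with a regex,
# tolerating surrounding whitespace and a trailing semicolon.
for script in bs.find_all('script', type='text/javascript'):
    m = re.search(r'window\.__SEARCH_RESULT__\s*=\s*(\{.*\})', script.get_text(), re.S)
    if m:
        data = json.loads(m.group(1))
        print(len(data['engine_search_result']), 'jobs on this page')
        break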
Scraping news flashes from cfbond.com
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

f = open('财富.txt', 'w', encoding='UTF-8')
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}
# The news-flash list is paginated as index_2.shtml ... index_9.shtml.
for i in range(2, 10):
    url = "http://www.cfbond.com/in/cfkxlb/index_{0}.shtml".format(i)
    ret = Request(url, headers=headers)
    html = urlopen(ret)
    bs = BeautifulSoup(html, "html.parser")
    titles = bs.findAll('h2', {'class': "pubList_tit"})  # headline elements (see the sketch below)
    # Follow each link on the list page and save the article body.
    for link in bs.findAll('a'):
        link = link.get('href')
        if not link or not link.startswith('http'):
            continue  # skip anchors with no usable absolute URL
        website = Request(link, headers=headers)
        web = urlopen(website)
        bs2 = BeautifulSoup(web, "html.parser")  # don't overwrite the list page's soup
        texts = bs2.find_all('div', {'class': 's_xlLContCRC'})
        for text in texts:
            f.write(text.get_text().replace(' ', '').replace('\r', ''))
f.close()
[Result screenshot omitted]
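Following every <a> on the list page also visits navigation and footer links; the guard above skips the worst of them, but scoping the search to the headline elements is tighter. A sketch that assumes each article link sits inside (or wraps) an <h2 class="pubList_tit">, which is an assumption about the page layout:

# Sketch: only follow links attached to a list headline, not every <a>.
for h2 in bs.findAll('h2', {'class': 'pubList_tit'}):
    a = h2.find('a') or h2.find_parent('a')  # anchor inside or around the headline
    if a and a.get('href'):
        print(a.get('href'))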
Scraping papers from the Chinese Journal of Computers (计算机学报)
import os
from urllib.request import Request, urlopen, urlretrieve
import bs4

url = 'http://cjc.ict.ac.cn/qwjs/No2020-01.htm'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
ret = Request(url=url, headers=header)
html = urlopen(ret)
bs = bs4.BeautifulSoup(html, 'html.parser')
# Paper titles are the <span style="color:#006688"> elements. The server
# mislabels its charset, so re-encode each title as latin-1 bytes and
# decode them as GBK to recover the Chinese text.
names = []
for span in bs.find_all('span', {'style': 'color:#006688'}):
    names.append(span.get_text().encode('iso-8859-1').decode('gbk'))
save_dir = os.path.abspath('C:/Users/86186/essary')
i = 0
# Download each linked PDF under the matching paper title.
for link in bs.find_all('a'):
    pdf = link.get('href')
    work_path = os.path.join(save_dir, '{}.pdf'.format(names[i]))
    urlretrieve(pdf, work_path)
    i = i + 1
[Result screenshot omitted]
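Indexing names[i] assumes the title spans and the <a> tags appear in the same order and quantity. A sketch that pairs them explicitly with zip, keeps only hrefs ending in .pdf (an assumption about this page's links), and strips characters Windows forbids in filenames:

import os
import re

# Sketch: pair titles and PDF links explicitly; zip stops at the shorter list.
pdf_links = [a.get('href') for a in bs.find_all('a') if a.get('href', '').endswith('.pdf')]
save_dir = os.path.abspath('C:/Users/86186/essary')
for name, pdf in zip(names, pdf_links):
    safe = re.sub(r'[\\/:*?"<>|]', '_', name)  # replace characters illegal in Windows filenames
    work_path = os.path.join(save_dir, '{}.pdf'.format(safe))
    print(pdf, '->', work_path)  # swap the print for urlretrieve(pdf, work_path) to download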
Scraping Douban Music Top 250
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import xlwt
# import time

workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('My Worksheet')
worksheet.write(0, 0, "排名")      # rank
worksheet.write(0, 1, "歌名")      # title
worksheet.write(0, 2, "歌手")      # artist
worksheet.write(0, 3, "发表时间")  # release date
worksheet.write(0, 4, "音乐类型")  # genre
worksheet.write(0, 5, "评分")      # rating
worksheet.write(0, 6, "详细链接")  # detail link
j = 1
for i in range(10):
    url = 'https://music.douban.com/top250?start={}'.format(i * 25)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
    ret = Request(url, headers=headers)
    html = urlopen(ret).read()
    bs = BeautifulSoup(html, "html.parser")
    div = bs.find("div", {"class": "indent"})
    divs = div.find_all("div", {"class": "pl2"})
    for div_pl2 in divs:
        title = div_pl2.select('a')[0].text.replace(' ', '')
        title = title.replace('\n', ' ').replace('\r', '')
        # The <p class="pl"> line reads "artist / release date / ... / genre".
        content = div_pl2.find("p", {"class": "pl"}).get_text().split('/')
        singer = content[0]
        music_time = content[1]
        music_type = content[-1]
        score = div_pl2.find("span", {"class": "rating_nums"}).get_text()
        link = div_pl2.find('a').get('href')
        worksheet.write(j, 0, j)
        worksheet.write(j, 1, title)
        worksheet.write(j, 2, singer)
        worksheet.write(j, 3, music_time)
        worksheet.write(j, 4, music_type)
        worksheet.write(j, 5, score)
        worksheet.write(j, 6, link)
        # time.sleep(1)
        j += 1
workbook.save('豆瓣音乐Top250.xls')
[Result screenshot omitted]
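The commented-out time.sleep(1) is worth enabling, since Douban throttles aggressive clients. A small helper sketch (the fetch name and its defaults are my own) that delays each request and retries once on an HTTP error:

import time
from urllib.error import HTTPError
from urllib.request import Request, urlopen

def fetch(url, headers, delay=1.0, retries=1):
    # Wait `delay` seconds before each request and retry once on an
    # HTTP error, to stay under the site's rate limit.
    for attempt in range(retries + 1):
        time.sleep(delay)
        try:
            return urlopen(Request(url, headers=headers)).read()
        except HTTPError:
            if attempt == retries:
                raise

html = fetch(url, headers) would then replace the urlopen(ret).read() call in the loop above.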
Scraping novels (incomplete)
import os
import time
from urllib.request import Request, urlopen
import bs4

url = 'https://www.xstt5.com/'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
ret = Request(url=url, headers=header)
html = urlopen(ret)
bs = bs4.BeautifulSoup(html, 'html.parser')
# Each <ul class="cl"> on the front page lists links to novel pages.
u = bs.find_all('ul', {'class': "cl"})
for i in u:
    links = i.find_all('a')
    for s in links:
        link = s.attrs['href']
        time.sleep(5)  # throttle between requests
        infos = Request(link, headers=header)
        htmls = urlopen(infos)
        bs1 = bs4.BeautifulSoup(htmls, 'html.parser')
        # For now, just print each novel's synopsis block.
        div = bs1.find_all('div', {'class': "ex"})
        for j in div:
            print(j)
# Unfinished: the next step would follow each chapter link and save the
# chapter text to local .txt files (see the earlier download examples).
[Result screenshot omitted]
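For the unfinished part, the missing step is writing each page's extracted text to disk instead of printing it. A generic sketch of that step; the output folder and the save_text helper are hypothetical, and inside the loop above one would call something like save_text(j.get_text(), chapter_number):

import os

out_dir = os.path.abspath('C:/Users/86186/novels')  # hypothetical output folder
os.makedirs(out_dir, exist_ok=True)

def save_text(text, index):
    # Write one page's extracted text to its own numbered .txt file.
    path = os.path.join(out_dir, '{}.txt'.format(index))
    with open(path, 'w', encoding='utf-8') as fp:
        fp.write(text)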
To learn next: basic regular expressions
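As a head start on that, the three re functions that cover most scraping needs, with toy inputs:

import re

text = 'rank 1: 肖申克的救赎 9.7'
print(re.search(r'\d+\.\d+', text).group())  # first float in the string -> '9.7'
print(re.findall(r'\d+', text))              # every digit run -> ['1', '9', '7']
print(re.sub(r'\s+', ' ', 'a   b\tc'))       # collapse whitespace -> 'a b c'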