Python Crawler Series, Part 1

Scraping the Fortune China 500 list

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

url = "http://www.fortunechina.com/fortune500/c/2020-08/10/content_372148.htm"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}
ret = Request(url, headers=headers)
html = urlopen(ret)
bs = BeautifulSoup(html, "html.parser")
# The ranking table is a single <tbody>; each <tr> is one company.
rows = bs.find('tbody').find_all('tr')
for row in rows:
    td = row.find_all('td')
    number = td[0].get_text().strip()
    names = td[1].get_text().strip()
    income = td[2].get_text().strip()
    profit = td[3].get_text().strip()
    country = td[4].get_text().strip()
    # chr(12288) is the full-width space; using it as the fill character
    # keeps columns of mixed Chinese/Latin text aligned.
    line = '{0:{5}<20}\t{1:{5}<20}\t{2:{5}<20}\t{3:{5}<20}\t{4:{5}<20}'.format(
        number, income, profit, country, names, chr(12288))
    print(line)
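
To keep the table instead of only printing it, the rows can be written out with the standard csv module. A minimal sketch reusing the rows list from the block above; the filename fortune500.csv and the utf-8-sig encoding (so Excel detects the Chinese text) are my own choices:

import csv

with open('fortune500.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(['rank', 'company', 'revenue', 'profit', 'country'])
    for row in rows:
        td = row.find_all('td')
        writer.writerow(td[k].get_text().strip() for k in range(5))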

[Result screenshot]

Scraping the Douban Top 250 movies

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

# The Top 250 list is paginated 25 movies at a time via the start parameter.
for i in range(0, 250, 25):
    url = "https://movie.douban.com/top250?start={0}&filter=".format(i)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}
    ret = Request(url, headers=headers)
    html = urlopen(ret)
    bs = BeautifulSoup(html, "html.parser")
    names = bs.find_all('span', {'class': "title"})
    scores = bs.find_all('span', {'class': "rating_num"})
    numbers = bs.find_all('em', {'class': ""})

    name_list = []
    for name in names:
        name = name.get_text()
        # Every movie has a second "title" span holding the foreign-language
        # title, whose text starts with "\xa0/"; keep only the primary title.
        if name[1] != '/':
            name_list.append(name)
    for number, name, score in zip(numbers, name_list, scores):
        print(number.get_text(), name, score.get_text())
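
If the third-party requests package is available, each page fetch can be written more compactly and with explicit error checking. A sketch of the equivalent request, not the code I actually ran:

import requests
from bs4 import BeautifulSoup

resp = requests.get('https://movie.douban.com/top250?start=0&filter=',
                    headers={'User-Agent': 'Mozilla/5.0'})
resp.raise_for_status()  # stop early on a 4xx/5xx response
bs = BeautifulSoup(resp.text, 'html.parser')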

[Result screenshot]

Scraping Chinese university rankings

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

url = "http://www.shanghairanking.cn/rankings/bcur/2020"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}
ret = Request(url, headers=headers)
html = urlopen(ret)
bs = BeautifulSoup(html, "html.parser")
# Table structure: tbody -> tr -> td
rows = bs.find('tbody').find_all('tr')
for row in rows:
    td = row.find_all('td')
    number = td[0].get_text().strip()
    college = td[1].get_text().strip()
    province = td[2].get_text().strip()
    category = td[3].get_text().strip()  # renamed from "type" to avoid shadowing the builtin
    score = td[4].get_text().strip()
    print(number, college, province, category, score)

[Result screenshot]

Scraping Chinese university rankings + detail links

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

url = "http://www.shanghairanking.cn/rankings/bcur/2020"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}
ret = Request(url, headers=headers)
html = urlopen(ret)
bs = BeautifulSoup(html, "html.parser")
rows = bs.find('tbody').find_all('tr')
for row in rows:
    td = row.find_all('td')
    number = td[0].get_text().strip()
    college = td[1].get_text().strip()
    province = td[2].get_text().strip()
    category = td[3].get_text().strip()
    score = td[4].get_text().strip()
    # The first <a> in the row links to the university's detail page;
    # the href is site-relative, so prepend the domain.
    a = row.find_all('a')
    href = a[0].get('href')
    link = 'http://www.shanghairanking.cn{0}'.format(href)
    print(number, college, province, category, score, link)
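
String concatenation only works while the site returns relative hrefs; urllib.parse.urljoin from the standard library handles relative and absolute hrefs alike. A small sketch (the href value here is made up for illustration):

from urllib.parse import urljoin

href = '/institution/example-university'  # example relative href
link = urljoin('http://www.shanghairanking.cn/', href)
print(link)  # http://www.shanghairanking.cn/institution/example-university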

[Result screenshot]

Scraping Levitan landscape oil paintings

import os
from urllib.request import Request, urlopen, urlretrieve
import bs4

url = 'https://www.sohu.com/a/286956359_301394'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
ret = Request(url=url, headers=header)
html = urlopen(ret)
bs = bs4.BeautifulSoup(html, 'html.parser')
# All the paintings are <img> tags inside the <article class="article"> body.
pictures = bs.find('article', {'class': "article"}).find_all('img')
save_dir = os.path.abspath('C:/Users/86186/picture')
os.makedirs(save_dir, exist_ok=True)  # create the target folder if it is missing
for i, picture in enumerate(pictures, start=1):
    src = picture.attrs['src']
    work_path = os.path.join(save_dir, 'picture{}.jpeg'.format(i))
    urlretrieve(src, work_path)
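
Pages like this sometimes mix in protocol-relative or placeholder src values, so it is safer to guard each download. A hedged variant of the loop above, reusing pictures and save_dir:

for i, picture in enumerate(pictures, start=1):
    src = picture.attrs.get('src', '')
    if src.startswith('//'):
        src = 'https:' + src  # complete protocol-relative URLs
    if not src.startswith('http'):
        continue  # skip inline or placeholder images
    try:
        urlretrieve(src, os.path.join(save_dir, 'picture{}.jpeg'.format(i)))
    except OSError as err:
        print('failed:', src, err)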

[Result screenshot]

Scraping 51job (qianchengwuyou) job listings

from urllib.request import Request, urlopen
import bs4
import json
import xlwt

workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('前程无忧招聘信息')
worksheet.write(0, 0, label='工作名称')
worksheet.write(0, 1, label='公司名称')
worksheet.write(0, 2, label='工作地区')
worksheet.write(0, 3, label='公司属性')
worksheet.write(0, 4, label='职位要求')
worksheet.write(0, 5, label='职责要求')
z = 1  # next empty row in the sheet

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
for page in range(1, 6):
    url = ('https://search.51job.com/list/080200,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,' + str(page)) + '.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    ret = Request(url=url, headers=header)
    html = urlopen(ret)
    bs = bs4.BeautifulSoup(html, 'html.parser')
    # 51job renders its result list from a JSON blob assigned to
    # window.__SEARCH_RESULT__ inside a <script> tag.
    for script in bs.find_all('script', type="text/javascript"):
        t = script.get_text()
        if 'window.__SEARCH_RESULT__' not in t:
            continue
        data = json.loads(t.replace('window.__SEARCH_RESULT__ = ', ''))
        jobs = data["engine_search_result"]

        for job in jobs:
            worksheet.write(z, 0, job['job_name'])
            worksheet.write(z, 1, job['company_name'])
            worksheet.write(z, 2, job['workarea_text'])
            worksheet.write(z, 3, job['companytype_text'])
            worksheet.write(z, 4, job['attribute_text'])
            # Follow the job's detail page for the full description.
            info = Request(job['job_href'], headers=header)
            detail = urlopen(info)
            bs2 = bs4.BeautifulSoup(detail, 'html.parser')
            try:
                texts = bs2.find('div', {"class": 'bmsg job_msg inbox'}).get_text().split()
                worksheet.write(z, 5, label="".join(texts))
            except AttributeError:  # detail page without the expected div
                worksheet.write(z, 5, label=' ')
            z += 1
    workbook.save('前程无忧.xls')  # save after each page so partial progress survives
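
The str.replace above assumes the assignment prefix matches exactly; a regular expression makes the extraction slightly more tolerant of whitespace. A sketch, assuming t holds the script text from the loop above:

import re
import json

m = re.search(r'window\.__SEARCH_RESULT__\s*=\s*(\{.*\})', t, re.S)
if m:
    data = json.loads(m.group(1))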

[Result screenshot]

Scraping financial news flashes (cfbond.com)

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

f = open('财富.txt', 'w', encoding='UTF-8')

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}
# Walk list pages 2..9 of the news-flash index.
for i in range(2, 10):
    url = "http://www.cfbond.com/in/cfkxlb/index_{0}.shtml".format(i)
    ret = Request(url, headers=headers)
    html = urlopen(ret)
    bs = BeautifulSoup(html, "html.parser")
    for a in bs.find_all('a'):
        link = a.get('href')
        if not link or not link.startswith('http'):
            continue  # skip relative and javascript: links that urlopen cannot fetch
        website = Request(link, headers=headers)
        web = urlopen(website)
        article = BeautifulSoup(web, "html.parser")
        # The article body sits in div.s_xlLContCRC on the detail page.
        for text in article.find_all('div', {'class': 's_xlLContCRC'}):
            f.write(text.get_text().replace(' ', '').replace('\r', ''))
f.close()
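
One dead link would currently abort the whole crawl; wrapping the fetch in a helper that swallows network errors keeps it running. A minimal sketch:

from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

def fetch(link, headers):
    """Return the opened response, or None if the link cannot be fetched."""
    try:
        return urlopen(Request(link, headers=headers), timeout=10)
    except (HTTPError, URLError) as err:
        print('skipping', link, err)
        return None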

[Result screenshot]

Scraping papers from the Chinese Journal of Computers

import os
from urllib.request import Request, urlopen, urlretrieve
import bs4

url = 'http://cjc.ict.ac.cn/qwjs/No2020-01.htm'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
ret = Request(url=url, headers=header)
html = urlopen(ret)
bs = bs4.BeautifulSoup(html, 'html.parser')

# Paper titles are the spans styled color:#006688. The page is GBK but the
# parser decodes it as ISO-8859-1, so re-encode/decode to recover the text.
names = []
for span in bs.find_all('span', {'style': 'color:#006688'}):
    names.append(span.get_text().encode('iso-8859-1').decode('gbk'))

save_dir = os.path.abspath('C:/Users/86186/essary')
os.makedirs(save_dir, exist_ok=True)
# Only follow links that actually point at PDFs, so the downloads
# stay in step with the title list.
i = 0
for link in bs.find_all('a'):
    pdf = link.get('href')
    if not pdf or not pdf.endswith('.pdf'):
        continue
    urlretrieve(pdf, os.path.join(save_dir, '{}.pdf'.format(names[i])))
    i = i + 1
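
Paper titles sometimes contain characters Windows refuses in filenames (e.g. : or ?); a small helper to sanitize each name before building the save path (the underscore replacement is my choice):

import re

def safe_filename(name):
    # strip characters that Windows disallows in file names
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip()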

[Result screenshot]

Scraping the Douban Music Top 250

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import xlwt
# import time

workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('My Worksheet')
worksheet.write(0, 0, "排名")
worksheet.write(0, 1, "歌名")
worksheet.write(0, 2, "歌手")
worksheet.write(0, 3, "发表时间")
worksheet.write(0, 4, "音乐类型")
worksheet.write(0, 5, "评分")
worksheet.write(0, 6, "详细链接")

j = 1
for i in range(10):
    url = 'https://music.douban.com/top250?start={}'.format(i * 25)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
    ret = Request(url, headers=headers)
    html = urlopen(ret).read()
    bs = BeautifulSoup(html, "html.parser")
    div = bs.find("div", {"class": "indent"})
    divs = div.find_all("div", {"class": "pl2"})
    for div_pl2 in divs:
        # Album title: the <a> text with internal whitespace collapsed.
        title = div_pl2.select('a')[0].text.replace(' ', '')
        title = title.replace('\n', ' ').replace('\r', '')
        # The <p class="pl"> line reads "artist / release date / ... / genre".
        content = div_pl2.find("p", {"class": "pl"}).get_text().split('/')
        singer = content[0]
        music_time = content[1]
        music_type = content[-1]
        score = div_pl2.find("span", {"class": "rating_nums"}).get_text()
        link = div_pl2.find('a').get('href')
        worksheet.write(j, 0, j)
        worksheet.write(j, 1, title)
        worksheet.write(j, 2, singer)
        worksheet.write(j, 3, music_time)
        worksheet.write(j, 4, music_type)
        worksheet.write(j, 5, score)
        worksheet.write(j, 6, link)
        # time.sleep(1)
        j += 1
workbook.save('豆瓣音乐Top250.xls')
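
Excel's default column width truncates long titles and links; xlwt can widen columns via worksheet.col(i).width. A small optional tweak (the widths are arbitrary, and these lines would go before the workbook.save call):

# xlwt measures width in 1/256 of a character
worksheet.col(1).width = 256 * 40  # song title
worksheet.col(6).width = 256 * 50  # detail link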

[Result screenshot]

Scraping a novel site

(incomplete version)

from urllib.request import Request, urlopen
import bs4

url = 'https://www.xstt5.com/'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
ret = Request(url=url, headers=header)
html = urlopen(ret)
bs = bs4.BeautifulSoup(html, 'html.parser')
# Each <ul class="cl"> block holds links to individual novel pages.
for u in bs.find_all('ul', {'class': "cl"}):
    for a in u.find_all('a'):
        link = a.attrs['href']
        info = Request(link, headers=header)
        page = urlopen(info)
        bs1 = bs4.BeautifulSoup(page, 'html.parser')
        # For now just print the description blocks; downloading the
        # chapter text into files is still TODO.
        for div in bs1.find_all('div', {'class': "ex"}):
            print(div)

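A possible shape for the missing save step, sketched under the assumption that the text of interest comes out of the div above; the novels folder and per-file naming are placeholders of mine:

import os

save_dir = os.path.abspath('novels')
os.makedirs(save_dir, exist_ok=True)

def save_chapter(index, text):
    # one .txt file per page of scraped text
    path = os.path.join(save_dir, '{}.txt'.format(index))
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)
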
[Result screenshot]
Still to learn:
Basic regular expressions
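
As a first taste of what regular expressions add to this toolbox, a minimal example of pulling href values straight out of raw HTML (the HTML string is made up):

import re

html = '<a href="https://example.com/a">A</a> <a href="/b">B</a>'
print(re.findall(r'href="([^"]+)"', html))  # ['https://example.com/a', '/b']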
