本文是对本专栏的简单复习。
至此,本专栏已经完结。后续最多只是修补一些内容上的bug了。
写这个专栏的初衷其实是为了自己的复习,如果这些内容对你的学习能起到帮助,那便是我的荣幸。
最后的感悟大概就是:
依然要勤学苦练,最终与实践结合。我们学习代码这个工具就是为了实战使用,而不只是学习函数。
一定要持之以恒地学习,并与实践相结合。
本文于2021/12/22首发于****,如有不足之处,请指出。
'''1.爬虫'''
# Section 1: web scraping. Fetch one page with requests and decode it to text.
import chardet
import requests

url = ''  # placeholder: put the target page URL here
ua = {"User-Agent": " "}  # placeholder: put a real browser User-Agent here
# A timeout keeps the script from hanging forever on an unresponsive server.
rqg = requests.get(url, headers=ua, timeout=10)
# Let chardet guess the encoding from the raw bytes instead of trusting the
# HTTP headers.
rqg.encoding = chardet.detect(rqg.content)['encoding']
# .text decodes with rqg.encoding, so the detection above is actually used;
# decoding the bytes as UTF-8 unconditionally would fail on pages served in
# another encoding.
html = rqg.text
from bs4 import BeautifulSoup
from lxml import etree

# --- BeautifulSoup: navigate the parsed tree -------------------------------
soup = BeautifulSoup(html, 'lxml')
tag = soup.ul  # attribute access returns the first <ul> in the document
tag.attrs  # dict of the tag's attributes
tag.li.get_text()  # text of the first <li> inside that <ul>
tag.get('href')  # value of one attribute, or None when it is absent
soup.find_all('ul')  # every <ul> in the document

# Collect the text and the link target of every <a> inside the first <ul>.
urls = []
herfs = []
for i in tag.find_all('a'):
    urls.append(i.get_text())
    herfs.append(i.get('href'))
for i in tag.find_all('a'):
    print(i.get('href'), i.get_text(), end ='\n' )

# --- lxml: query the same HTML with XPath ----------------------------------
xp = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
xp.xpath('//a')  # all <a> elements
xp.xpath("body/div/a[starts-with(@id,'co')]")  # <a> under body/div whose id starts with 'co'
xp.xpath("//a/text()")  # text nodes directly under each <a>
# starts-with() takes two arguments (the value and the prefix); 'co' mirrors
# the prefix used two lines above.
# NOTE(review): replace 'co' with the id prefix of the page actually scraped.
j = xp.xpath("//p[starts-with(@id,'co')]")
for i in j:
    # string(.) concatenates all text below the element, nested tags included.
    t = i.xpath('string(.)')
    print(t)
import re

# Extract <title> elements from the fetched page.
# Group 1 holds the tag's attributes (empty string when there are none) and
# group 2 holds the title text. The attribute part is optional so that a bare
# <title>...</title> matches as well as <title lang="...">...</title>.
title_pattern = r'<title(?:\s+(.*?))?>(.*?)</title>'
# re.S lets '.' span line breaks, so a title split over several lines is found.
title_com = re.compile(title_pattern, re.M|re.S)
# List of (attributes, text) tuples, one per <title> element found.
title_find = title_com.findall(rqg.text)
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

# Drive a real Chrome instance so that JavaScript-rendered content is present.
# NOTE(review): passing the driver path positionally is the Selenium 3 calling
# convention; recent Selenium 4 releases expect a Service object instead —
# confirm against the installed version.
div = webdriver.Chrome('./chromedriver')
div.get(url)
time.sleep(5)  # crude wait for the page's scripts to finish rendering
html = div.page_source  # HTML after rendering

# find_element returns only the first match; find_elements returns a list.
# The find_element(By.<kind>, value) form is used because the
# find_element_by_* helpers no longer exist in current Selenium releases.
element = div.find_element(By.ID, 'pass')
elements = div.find_elements(By.NAME, 'a')
element2 = div.find_element(By.XPATH, "//p[@id='pass']")
element3 = div.find_element(By.TAG_NAME, 'div')
import time
import requests
import json

# Call a JSON endpoint directly and print two fields of every record.
url = ''  # placeholder: put the JSON API URL here
ua = {"User-Agent": ' '}  # placeholder: put a real browser User-Agent here
# .text decodes the body with the encoding requests determined for the
# response; .content.decode('utf-8') is the alternative when the body is
# known to be UTF-8.
# A timeout keeps the script from hanging forever on an unresponsive server.
html = requests.get(url, headers=ua, timeout=10).text
data = json.loads(html)
dic = data['data']  # the records are read from the top-level 'data' key
for i in dic:
    print(i['picPath'], i['bookName'])
import requests
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

# Render the page in Chrome and print the text of every <div class="book">.
# NOTE(review): passing the driver path positionally is the Selenium 3 calling
# convention; recent Selenium 4 releases expect a Service object instead —
# confirm against the installed version.
div1 = webdriver.Chrome('./chromedriver')
try:
    div1.get(url)
    time.sleep(5)  # crude wait for the page's scripts to finish rendering
    # Query div1, the browser that just loaded `url`; div1.page_source would
    # give the rendered HTML if the raw markup were needed instead.
    e1 = div1.find_elements(By.XPATH, "//div[@class='book']")
    for book in e1:
        print(book.text)
finally:
    # Close this browser even when loading or querying fails.
    div1.quit()
# Also close the browser opened earlier in the file as `div`.
div.quit()
'''2.写入文件'''
# Section 2: writing results to files.
# Append the page text to a plain-text file. The encoding is given explicitly
# so that characters outside the platform's default code page can be written.
with open(r'c:\file.txt', 'a+', encoding='utf-8') as f:
    f.write(rqg.text)

import xlwings as xw

# Write values into an Excel workbook through xlwings.
wb = xw.Book(r"c:\excel.xlsx")
sht = wb.sheets['Sheet1']
# Write the list starting at cell A2; the value may also be a DataFrame.
sht.range('A2').value = ['aaa', 'nnn']
'''3.pandas'''
# Section 3: NumPy and pandas basics.
import numpy as np
import pandas as pd

# Random arrays.
np.random.random((4, 5))  # uniform floats in [0, 1); shape passed as one tuple
np.random.rand(4, 5)  # uniform floats in [0, 1); shape passed as separate arguments
np.random.randn(4, 5)  # standard normal samples; randn(10) gives a 1-D array of ten values
np.random.randint(5, 10, size=[2, 5])  # integers in [5, 10): the upper bound is exclusive

# Flattening, joining and splitting.
arr1 = np.arange(4)
arr1.ravel()  # flattened to 1-D
arr1.flatten()  # flattened to 1-D as a copy; the call parentheses are required
arr2 = np.arange(1)
# axis=0 joins along the first axis, the only axis 1-D arrays have.
arr_st = np.concatenate((arr1, arr2), axis=0)
# arr1 is 1-D, so it can only be split along axis 0. For a 2-D array, axis=0
# splits the rows and axis=1 splits the columns.
arr_sp = np.split(arr1, 2, axis=0)
# How read_csv treats the `header` argument:
#   header=0    -> the first row of the file supplies the column names; the
#                  default (header='infer') behaves this way when `names` is
#                  not passed.
#   header=None -> the file has no header row; columns are numbered 0..n-1.
# header=0 without index_col is used here because the statements below select
# 'city' and 'sex' as named columns. Passing index_col='city' would move
# 'city' into the index, which is what set_index('city') does further down.
df1 = pd.read_csv(r"c:/df.csv", header=0)

# Structure and summary statistics.
df1.values
df1.index
df1.columns
df1.dtypes
df1.size
df1.ndim
df1.shape
df1.describe()
df1.info()
# mean() needs a numeric column ('city' holds strings); .var() and .std()
# are called the same way.
df1['小时报酬'].mean()
# NOTE(review): df2 previously existed only as a commented-out line; it is
# restored so that combine_first below has a second frame — confirm this is
# the intended operand.
df2 = df1.set_index('city')

# Selecting columns and rows.
df1.loc[:, ['city', 'sex']]  # columns by label
df1[['city', 'sex']]  # the same columns, shorthand form
df1.iloc[:, :2]  # first two columns by position
df1[:2]  # first two rows
# Not valid: df1[:, :2]. df1[0] is a column lookup for the label 0, not a
# way to get the first row.
df1.loc[df1["city"] == '北京']  # rows selected with a boolean mask
df1[(df1["city"] == '北京') & (df1["sex"] == 'female')]

# Adding and removing data.
data = {"city":'lanzhou', "sex":"female"}
# DataFrame.append no longer exists in pandas 2.x; concat with a one-row frame
# is the replacement. ignore_index=True renumbers the rows so the new row's
# label cannot clash with an existing one. The result is not assigned, so df1
# itself is unchanged.
pd.concat([df1, pd.DataFrame([data])], ignore_index=True)
df1['age'] = [20, 19, 21]  # the list length must equal the number of rows (3)
# drop() takes index labels. With three rows labelled 0..2 this removes the
# 1st and 3rd rows.
# NOTE(review): adjust the labels if the real index differs.
df1.drop([0, 2])
df1.drop(columns=["age", "city"])  # index=... drops rows; labels plus axis= also works
df1.to_csv(r"C:\i.csv", sep=',')  # to_csv is a DataFrame method, not a pandas function

# Grouping, pivoting and combining.
groupby = df1.groupby('分公司')[['薪水', '小时报酬']].agg(['min', 'max'])
pivot_table = df1.pivot_table(values=['小时报酬', '薪水'], index=['分公司', '部门'])
# join='inner' keeps only the index labels present in every frame;
# join='outer' keeps all labels and fills the gaps with NaN.
concat_join = pd.concat([df1[:2], df1[2:]], axis=1, join='inner')
merge = pd.merge(df1[:2], df1[2:], left_on='学号', right_on="学号")
# Values missing in df1 are filled from df2 where index and column labels match.
combine = df1.combine_first(df2)

# Duplicates and missing values.
df1["姓名"].drop_duplicates()  # remove repeated values
df1.isnull()  # notnull() is the inverse
# how='all' drops a row only when every value in it is missing;
# how='any' drops it as soon as one value is missing.
df1.dropna(axis=0, how='all')
df1['小时报酬'].fillna(df1['小时报酬'].mean())  # fill gaps with the column mean
# Lagrange interpolation. scipy.interpolate also offers interp1d and
# make_interp_spline.
from scipy.interpolate import lagrange

# NOTE(review): x and y1 (the known sample points) are not defined anywhere in
# the visible part of this file; they must be assigned before this line runs.
l1 = lagrange(x, y1)
l1([6, 7])  # interpolated y1 values at x = 6 and x = 7
def outRange(ser):
    """Return the entries of *ser* lying more than three standard deviations from its mean.

    Args:
        ser: A numeric pandas Series.

    Returns:
        A Series containing only the outlying entries, with their original
        index labels. It is empty when no entry is that far from the mean.
    """
    center = ser.mean()
    spread = 3 * ser.std()
    # Named `is_outlier` rather than `bool` so the builtin is not shadowed.
    is_outlier = (ser < center - spread) | (ser > center + spread)
    # Select by position with the raw boolean array, so the result does not
    # depend on what the index labels happen to be.
    return ser.iloc[is_outlier.to_numpy()]
# Run outRange (defined earlier in this file) on the "age" column and keep
# whatever entries it returns.
outlier = outRange(ser=df1["age"])
# Dummy (indicator) variables built from the "name" column.
pd.get_dummies(data=df1["name"])
import matplotlib.pyplot as plt

# Basic pyplot calls. Other plot types are called the same way: barh, plot,
# boxplot, stackplot, hist, pie, scatter, polar, errorbar.
# bar() requires the x positions and the bar heights; the values here are
# placeholders.
plt.bar(['a', 'b', 'c'], [1, 2, 3])
plt.figure()  # start a new figure; the calls below apply to it
plt.xlabel('x轴标签')
# xticks takes the tick positions first and the labels shown at them second.
plt.xticks([0, 1, 2], ['a', 'b', 'c'])
plt.title('设置标题')
plt.legend()  # with no arguments the legend is built from labelled artists
# NOTE(review): `lines` (the plotted line handles, e.g. the return value of
# plt.plot) is not defined in the visible part of this file.
plt.legend(lines, ['线条1', "线条2"], loc='best')
plt.grid(visible=True)  # show the grid

# subplots returns the figure first and the array of axes second.
fig, ax = plt.subplots(2, 2, figsize=(10,5))
ax1 = ax[1, 0]  # second row, first column of the 2x2 grid
ax1.plot()  # draw on that subplot only