代码
import requests
def getHTMLText(url):
    """Fetch *url* and return the first 10000 characters of the page text.

    No custom headers are passed, so the request goes out with the
    default headers that the requests library supplies.

    Args:
        url: Address of the page to download.

    Returns:
        The first 10000 characters of the decoded response body on
        success. If anything raises (connection problem, timeout, HTTP
        error status, ...), the string form of that exception is returned
        instead of the exception propagating.
    """
    try:
        r = requests.get(url, timeout=30)
        # Turn 4xx/5xx responses into an exception so an error page is
        # never returned as if it were the requested content.
        r.raise_for_status()
        # Decode with the encoding detected from the body content.
        r.encoding = r.apparent_encoding
        return r.text[:10000]
    except Exception as err:
        return str(err)
if __name__ == '__main__':
    # Amazon product page requested without any custom headers.
    url = 'https://www.amazon.cn/dp/B077B7Z1C1?ref_=Oct_DotdV2_PC_2_GS_DOTD_56a14cb3&pf_rd_r=XF9AG6HRPAWC2MGNWJQ8&pf_rd_p=8c48638a-3752-448a-8685-5a17153fb132&pf_rd_m=A1AJ19PSB66TGU&pf_rd_s=desktop-2'
    print(getHTMLText(url))
运行上面的代码会报 503 错误。
原因
亚马逊会对请求来源进行审查。
Requests 库会如实地通过请求头中的 User-Agent 字段告诉亚马逊,请求来源是 'python-requests/2.11.1'(版本号取决于所安装的 Requests 版本),因此请求被拒绝。
解决办法
通过 headers 参数把 user-agent 设置为浏览器标识(如 'Mozilla/5.0'),模拟浏览器发起请求:
import requests
def getHTMLText(url):
    """Fetch *url* with a browser-like User-Agent and return the page text.

    The request carries a ``user-agent`` header of ``Mozilla/5.0`` in place
    of the default one the requests library would send.

    Args:
        url: Address of the page to download.

    Returns:
        The first 3000 characters of the decoded response body on
        success. If anything raises (connection problem, timeout, HTTP
        error status, ...), the string form of that exception is returned
        instead of the exception propagating.
    """
    try:
        # Override the library's default User-Agent with a browser-style one.
        headers = {'user-agent': 'Mozilla/5.0'}
        r = requests.get(url, headers=headers, timeout=30)
        # Turn 4xx/5xx responses into an exception so an error page is
        # never returned as if it were the requested content.
        r.raise_for_status()
        # Decode with the encoding detected from the body content.
        r.encoding = r.apparent_encoding
        return r.text[:3000]
    except Exception as err:
        return str(err)
if __name__ == '__main__':
    # Same Amazon product page, now fetched through the header-setting helper.
    url = 'https://www.amazon.cn/dp/B077B7Z1C1?ref_=Oct_DotdV2_PC_2_GS_DOTD_56a14cb3&pf_rd_r=XF9AG6HRPAWC2MGNWJQ8&pf_rd_p=8c48638a-3752-448a-8685-5a17153fb132&pf_rd_m=A1AJ19PSB66TGU&pf_rd_s=desktop-2'
    print(getHTMLText(url))