Python抓取华中大二手市场商品信息

最近做项目需要获取一些商品信息,于是写了个简单的脚本来抓取某电子商场的商品数据。基本原理是发送 HTTP 请求,然后对响应文本做正则匹配,提取出想要的内容。

#coding=utf-8
#time:2014/4/29
#author:Li
#OS:windows
import requests
import re
import os

def _fetch(url):
	'''Return the raw response body of url, or None if the request fails.

	A timeout is set so a single unresponsive page cannot hang the crawl.
	'''
	try:
		return requests.get(url, timeout=30).content
	except requests.RequestException as err:
		print("request failed: %s (%s)" % (url, err))
		return None


def _first_match(regex, html):
	'''Return the stripped first capture group of regex in html, or None.'''
	found = regex.search(html)
	if found is None:
		return None
	return found.group(1).strip()


def catch_ershou():
	'''Scrape the HUST second-hand market and append the goods to ershou.txt.

	Walks index pages 1..21, collects every goods-detail link, extracts
	name, price and the five attribute fields from each detail page with
	regular expressions, and appends one JSON-like line per item to
	ershou.txt in the current working directory.

	Fields that cannot be found keep the placeholder value "无"; pages that
	cannot be downloaded are reported and skipped instead of aborting the run.

	NOTE: written for Python 2 -- the str patterns are matched against the
	raw response bytes.
	'''
	host_url = "http://ershou.hustonline.net"

	# Compile every pattern once instead of once per downloaded page.
	link_re = re.compile(r'ui-link-img[^>]+?href="(/goods/details/.+?)"')
	name_re = re.compile(r'stock-info-name.+?>(.+?)</h3>')
	price_re = re.compile(r'stock-price.+?>(.+?)</span>')
	attr_re = re.compile(r'stock-info-attr.+?>([^<]+?)</div>')
	attr_keys = ("addr", "time", "Tags", "contact", "QQ")

	# Collect the detail-page links from all 21 index pages.
	links = []
	for page in range(1, 22):
		html = _fetch(host_url + '/index/index/' + str(page) + '/all')
		if html is None:
			continue
		links.extend(host_url + path for path in link_re.findall(html))

	# Extract the goods information from every detail page.
	good_arr = []
	for link in links:
		print(link)
		good_info = {"name":"无","price":"无","addr":"无","time":"无","Tags":"无","contact":"无","QQ":"无"}
		html = _fetch(link)
		if html is not None:
			# Each field is looked up independently so that one missing
			# field no longer discards the ones that are present.
			name = _first_match(name_re, html)
			if name is not None:
				good_info["name"] = name
			price = _first_match(price_re, html)
			if price is not None:
				good_info["price"] = price
			# The attributes are positional; only trust them when all
			# five are present.
			attrs = attr_re.findall(html)
			if len(attrs) >= len(attr_keys):
				for key, value in zip(attr_keys, attrs):
					good_info[key] = value.strip()
		good_arr.append(good_info)
	# len() returns an int, so it must be converted before concatenation.
	print("total links:" + str(len(links)))

	# Write the result to the file; "with" closes it even if a write fails.
	columns = (("名称", "name"), ("价格", "price"), ("交易地点", "addr"),
			("发布时间", "time"), ("Tags", "Tags"), ("联系人", "contact"),
			("QQ", "QQ"))
	try:
		with open(os.path.join(os.getcwd(), 'ershou.txt'), 'a+') as fp:
			for good_info in good_arr:
				pairs = ['"%s":"%s"' % (label, good_info[key]) for label, key in columns]
				fp.write('{' + ','.join(pairs) + "}\r\n")
	except IOError:
		print("write reasult in file failed!")
	print("all is done...")

def main():
	'''Script entry point: run the second-hand market scraper once.'''
	catch_ershou()


# Run the scraper only when executed as a script, not when imported.
if "__main__" == __name__:
	main()


上一篇:未能加载文件或程序集 Microsoft.ReportViewer.Common, Version=11.0.0.0


下一篇:WPF入门:数据绑定