For a recent project I needed to collect some product listings, so I wrote a simple script to scrape a second-hand electronics marketplace. The basic idea is to send an HTTP request, then analyze the response text and use regular expressions to pull out the content I want.
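Before the full script, here is a minimal sketch of that request-then-regex idea, using the same index URL and link pattern that the script below uses (just the first index page, printed instead of stored):

#coding=utf-8
# Minimal sketch: fetch one index page and regex-match the detail links.
import requests
import re

req = requests.get('http://ershou.hustonline.net/index/index/1/all')
# pull every goods detail link out of the raw HTML with a regular expression
regex = re.compile(r'ui-link-img[^>]+?href="(/goods/details/.+?)"')
for link in regex.findall(req.content):
    print 'http://ershou.hustonline.net' + link

The full script simply repeats this over all index pages, visits each detail page, and writes the extracted fields to a file: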
#coding=utf-8
#time:2014/4/29
#author:Li
#OS:windows
import requests
import re
import os


def catch_ershou():
    '''Scrape the HUST second-hand market (华中大二手市场)'''
    host_url = "http://ershou.hustonline.net"
    # build all the index page URLs, 21 pages in total
    index_url = []
    for i in range(1, 22):
        index_url.append(host_url + '/index/index/' + str(i) + '/all')
    # find all the goods detail page links
    links = []
    for url in index_url:
        req = requests.get(url)
        req.encoding = 'utf-8'
        regex = re.compile(r'ui-link-img[^>]+?href="(/goods/details/.+?)"')
        for link in regex.findall(req.content):
            links.append(host_url + link)
    # collect the information of every goods item
    good_arr = []
    for link in links:
        print link
        good_info = {"name": "无", "price": "无", "addr": "无", "time": "无",
                     "Tags": "无", "contact": "无", "QQ": "无"}
        req = requests.get(link)
        req.encoding = 'utf-8'
        try:
            regex = re.compile(r'stock-info-name.+?>(.+?)</h3>')
            info = regex.search(req.content).groups()
            good_info.update({"name": info[0].strip()})
            regex = re.compile(r'stock-price.+?>(.+?)</span>')
            info = regex.search(req.content).groups()
            good_info.update({"price": info[0].strip()})
            regex = re.compile(r'stock-info-attr.+?>([^<]+?)</div>')
            info = regex.findall(req.content)
            good_info.update({"addr": info[0].strip(), "time": info[1].strip(),
                              "Tags": info[2].strip(), "contact": info[3].strip(),
                              "QQ": info[4].strip()})
        except Exception:
            pass
        good_arr.append(good_info)
    print "total links:" + str(len(links))
    # write the result to a file
    try:
        fp = open(os.path.join(os.getcwd(), 'ershou.txt'), 'a+')
        for good_info in good_arr:
            fp.write('{"名称":' + '"' + good_info["name"] + '",' +
                     '"价格":' + '"' + good_info["price"] + '",' +
                     '"交易地点":' + '"' + good_info["addr"] + '",' +
                     '"发布时间":' + '"' + good_info["time"] + '",' +
                     '"Tags":' + '"' + good_info["Tags"] + '",' +
                     '"联系人":' + '"' + good_info["contact"] + '",' +
                     '"QQ":' + '"' + good_info['QQ'] + '"' +
                     '}\r\n')
        fp.close()
    except Exception:
        print "write result to file failed!"
    print "all is done..."


def main():
    catch_ershou()


if __name__ == '__main__':
    main()
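A note on the output: each record in ershou.txt is a hand-assembled JSON-style line, so a stray double quote inside any field would break the format. If that matters, the write step could be swapped for json.dumps. The sketch below is just one possible replacement, assuming the same good_info dictionaries (UTF-8 byte strings) built in the script above:

#coding=utf-8
import json

# Sketch of an alternative write step using json.dumps instead of string
# concatenation; assumes good_info dicts like those built by catch_ershou().
def write_goods(good_arr, path):
    fp = open(path, 'a+')
    for good_info in good_arr:
        record = {
            u"名称": good_info["name"].decode('utf-8'),
            u"价格": good_info["price"].decode('utf-8'),
            u"交易地点": good_info["addr"].decode('utf-8'),
            u"发布时间": good_info["time"].decode('utf-8'),
            u"Tags": good_info["Tags"].decode('utf-8'),
            u"联系人": good_info["contact"].decode('utf-8'),
            u"QQ": good_info["QQ"].decode('utf-8'),
        }
        # ensure_ascii=False keeps the Chinese keys readable in the file
        fp.write(json.dumps(record, ensure_ascii=False).encode('utf-8') + '\r\n')
    fp.close()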