python 爬虫-sohu抓小说

#coding:utf-8
"""Scrape a multi-page article from book.sohu.com into a local text file.

Each page is fetched from ``<url><offset>.shtml``.  The first ``<h1>`` element
supplies the page title, and the markup found between ``</script>`` and
``<div class="pages">`` inside the ``articleBody`` region supplies the
paragraphs.  Title and paragraphs are appended, one per line, to
``juyudao.txt`` in the current working directory.

The page is handled as raw bytes from download to disk, so the text is written
in whatever encoding the server delivered it in.
"""
try:
    import urllib2
except ImportError:
    # Python 3: urlopen lives in urllib.request; keep the old module name.
    import urllib.request as urllib2
import sys
import re

# File that every scraped page is appended to.
OUTPUT_FILE = r'juyudao.txt'

# Any tag at all; used to reduce the <h1> element to its text.
_ANY_TAG = re.compile(b'<[^>]+>')
# Any tag that contains no 'p' character.  This keeps the <p> separators that
# the body is split on (it also keeps any other tag containing a 'p').
_TAG_WITHOUT_P = re.compile(b'<[^>p]+>')
# First-to-last <h1>...</h1> on a single line ('.' does not match newlines).
_HEADING = re.compile(b'(<h1.*</h1>)')
# Coarse slice of the page that contains the article.  Both body patterns run
# on whitespace-stripped HTML, which is why the second one reads '<divclass'.
_BODY_REGION = re.compile(b'articleBody(.*)class="pages">', re.S)
_BODY = re.compile(b'</script>(.*)<divclass="pages">')


def _printable(data):
    """Return *data* in a form that is safe to pass to print()."""
    if isinstance(data, str):
        # Python 2 byte string: print it unchanged, as the script always did.
        return data
    # Python 3 bytes: decode for console display only.
    return data.decode('utf-8', 'replace')


def _extract_heading(content):
    """Return the raw first ``<h1>`` match in *content*, or None if absent."""
    headings = _HEADING.findall(content)
    if not headings:
        return None
    return headings[0]


def _extract_paragraphs(content):
    """Return the article body of *content* as a list of byte strings.

    Returns None when the ``articleBody`` ... ``class="pages">`` region or the
    ``</script>`` ... ``<div class="pages">`` body inside it cannot be found.
    """
    # Drop line breaks and ASCII spaces so the single-line patterns below can
    # match across the whole document.
    # NOTE(review): the original removed ' ' twice; the second call may have
    # targeted a different whitespace character lost in transcription.
    flat = content.replace(b'\r', b'').replace(b'\n', b'').replace(b' ', b'')

    # First narrow the document down to the part that holds the article.
    region = _BODY_REGION.search(flat)
    if region is None:
        return None
    bodies = _BODY.findall(region.group())
    if not bodies:
        return None

    # Remove closing </p> tags and every tag without a 'p', then split on the
    # remaining opening <p> tags.
    body = _TAG_WITHOUT_P.sub(b'', bodies[0].replace(b'</p>', b''))
    return body.split(b'<p>')


def getPage(url, offset=''):
    """Fetch one page and append its title and paragraphs to OUTPUT_FILE.

    url    -- page address without the trailing ``.shtml``
    offset -- text inserted between *url* and ``.shtml`` (the page number
              for follow-up pages, empty for the first page)

    A page without an ``<h1>`` title or without a recognisable article body
    is reported on stdout and skipped; nothing is written for it.  Errors
    raised by ``urlopen`` propagate to the caller.
    """
    realurl = "%s%s%s" % (url, offset, '.shtml')
    print(realurl)

    resp = urllib2.urlopen(realurl)
    try:
        content = resp.read()
    finally:
        # Release the connection even if the read fails.
        resp.close()

    heading = _extract_heading(content)
    if heading is None:
        print('%s: no <h1> title found, page skipped' % realurl)
        return
    print('%s ok' % _printable(heading))
    title = _ANY_TAG.sub(b'', heading)
    print(_printable(title))

    paragraphs = _extract_paragraphs(content)
    if paragraphs is None:
        print('%s: no article body found, page skipped' % realurl)
        return

    # Parse everything before opening the file so a malformed page never
    # leaves a title without its text.  Binary append keeps the page's own
    # bytes unchanged; 'with' closes the file even if a write fails.
    with open(OUTPUT_FILE, 'ab') as fp:
        fp.write(b'\n'.join([title] + paragraphs) + b'\n')


def getBook(url, startoffset, endOffset):
    """Fetch pages *startoffset* up to, but not including, *endOffset*.

    Each page number is converted to text and passed to getPage() as the
    offset appended to *url*.
    """
    page = startoffset
    while page < endOffset:
        getPage(url, offset=str(page))
        page += 1


if __name__ == '__main__':
    # The first page has no numeric suffix; follow-up pages are "<id>_<n>".
    getPage(url='http://book.sohu.com/20131107/n389762800', offset='')
    getBook(url='http://book.sohu.com/20131107/n389762800_', startoffset=1, endOffset=20)
上一篇:2、MyEclipse和Eclipse调优,MyEclipse配置(tomcat和jdk的内存设置),jar引入相关知识点,将Java项目变成web项目的办法


下一篇:linux php 安装GD库