Today I had an idea: I want to write my own Sina celebrity blog recommendation system in Python.
The first step is to collect the Sina celebrity blog data, so I wrote a crawler. For each recommended post on the Sina celebrity blog roll it scrapes the author's name, the link to the post, and the link to the author's blog list or homepage. The program is as follows:
# -*- coding: utf-8 -*-
"""
Created on Wed May 20 13:55:00 2015
@author: Administrator
"""
import re
import sys
import urllib

from bs4 import BeautifulSoup

# Python 2 hack so unicode strings can be written to files directly.
reload(sys)
sys.setdefaultencoding("utf-8")

if __name__ == "__main__":
    # The blog roll is paginated as index_1.shtml .. index_139.shtml.
    for i in range(1, 140):
        url = "http://roll.ent.sina.com.cn/blog/star/index_" + str(i) + ".shtml"
        # Keep a log of every list page visited.
        fp = file("EveryPageHref.txt", "a")
        fp.write(url)
        fp.write("\n")
        fp.close()

        page = urllib.urlopen(url).read()
        soup = BeautifulSoup(page, from_encoding="gb18030")
        list_ul = soup.find_all("ul", class_="list_009")
        list_li = list_ul[0].find_all("li")
        for li in list_li:
            list_a = li.find_all("a")
            one_link = list_a[1].get("href")  # link to the recommended post
            print list_a[0].get_text()
            print one_link
            if len(one_link) > 10:
                page = urllib.urlopen(one_link).read()
                if len(page) != 0:
                    # The post page carries an <a class="on" ...> tag that
                    # points to the author's blog list / homepage.
                    pattern = r'<a class="on" href=.*?>'
                    link = re.findall(pattern, page, re.M | re.S)
                    if link:
                        a_soup = BeautifulSoup(link[0], from_encoding="gb18030")
                        a_href = a_soup.find_all('a')
                        href = a_href[0].get('href')
                        print href
                        # Append author, post link and homepage link to title.txt.
                        fp = file("title.txt", "a")
                        fp.write(list_a[0].get_text())
                        fp.write("\n")
                        fp.write(one_link)
                        fp.write("\n")
                        fp.write(href)
                        fp.write("\n")
                        fp.close()
    print "OK!"
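
The script above is Python 2 (file(), print statements, reload(sys)). For Python 3, a minimal sketch of the same crawl using requests and BeautifulSoup might look like the following; it assumes the 2015 page layout (ul.list_009 on the roll pages, an <a class="on"> tag on each post page) is still in place, so treat it as a starting point rather than a tested replacement:

# -*- coding: utf-8 -*-
# Python 3 sketch of the same crawler; the page structure is an
# assumption based on the 2015 layout and may have changed since.
import re

import requests
from bs4 import BeautifulSoup

for i in range(1, 140):
    url = "http://roll.ent.sina.com.cn/blog/star/index_%d.shtml" % i
    with open("EveryPageHref.txt", "a", encoding="utf-8") as fp:
        fp.write(url + "\n")

    resp = requests.get(url)
    resp.encoding = "gb18030"  # the roll pages are GB-encoded
    soup = BeautifulSoup(resp.text, "html.parser")
    list_ul = soup.find("ul", class_="list_009")
    if list_ul is None:
        continue
    for li in list_ul.find_all("li"):
        anchors = li.find_all("a")
        if len(anchors) < 2:
            continue
        one_link = anchors[1].get("href", "")
        print(anchors[0].get_text(), one_link)
        if len(one_link) <= 10:
            continue
        post = requests.get(one_link)
        post.encoding = "gb18030"
        # Same heuristic as above: the author's homepage sits in an
        # <a class="on" ...> tag on the post page.
        m = re.search(r'<a class="on" href=.*?>', post.text, re.S)
        if m is None:
            continue
        home = BeautifulSoup(m.group(0), "html.parser").find("a").get("href")
        with open("title.txt", "a", encoding="utf-8") as fp:
            fp.write("%s\n%s\n%s\n" % (anchors[0].get_text(), one_link, home))

print("OK!")

This version lets requests handle the GB18030 decoding and uses with open(...) so the files are closed even if a request fails partway through; adding a short time.sleep between requests would also go easier on the server.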