python 爬虫爬取腾讯新闻科技类的企鹅智酷系列(1)

废话不多说,直接贴代码,主要采用BeautifulSoup写的

#coding:utf8

from bs4 import BeautifulSoup
import urllib2
import urllib

import os


i = 0        # running article number used when writing entries to AllTitle.txt
j = 0        # index into list_a selecting the output filename for the current article
list_a = []  # numbered article titles ("N、title"); filled by gethref(), read by gettext()


def gettext(href):
    """Fetch one article page and append all of its paragraph text to
    "<title>.txt", where the title is taken from list_a[j].

    Bug fixed: the original incremented j and opened a fresh file handle
    for every <p> tag, so a multi-paragraph article was scattered across
    files named after *other* articles, every handle was leaked, and
    list_a[j] eventually raised IndexError. The file is now opened once
    per article, always closed, and j advances exactly once.
    """
    global j
    page = urllib.urlopen(href).read()
    # NOTE(review): page looks gb18030-encoded per the original — confirm
    soup = BeautifulSoup(page, from_encoding="gb18030")
    div = soup.find_all("div", class_="content")
    p_text = div[0].find_all("p")
    fp = open("%s.txt" % list_a[j], "a")  # open() replaces py2-only file()
    try:
        for p in p_text:
            fp.write(' ')
            fp.write(p.get_text())
            fp.write(" \n")
    finally:
        fp.close()
    j += 1  # one article consumed -> next title in list_a


def gethref(url):
    """Scrape the listing page at *url*: write each article's numbered
    title, summary and link to AllTitle.txt, and download every linked
    article body via gettext().

    Fixes: the output file handle is now closed even on error (the
    original leaked it), open() replaces the py2-only file() builtin,
    and the already-fetched href is reused instead of re-querying the
    <a> tag a second time.
    """
    global i
    fp = open("AllTitle.txt", "w+")
    try:
        page = urllib.urlopen(url).read()
        # NOTE(review): page looks gb18030-encoded per the original — confirm
        soup = BeautifulSoup(page, from_encoding="gb18030")
        ul = soup.find_all("ul", class_="row1")
        li = ul[0].find_all("li")
        for lia in li:
            title = lia.h3.get_text()
            # remember the numbered title; gettext() uses it as a filename
            list_a.append(("%s、" % (i + 1)) + title)
            href = lia.a.get('href')
            # one numbered record: title / summary / link
            fp.write("%s、" % (i + 1))
            i += 1
            fp.write("标题:")
            fp.write(title)
            fp.write("\n 简介:")
            fp.write(lia.p.get_text())
            fp.write("\n 链接:")
            fp.write(href)
            fp.write("\n")
            gettext(href)
    finally:
        fp.close()


if "__main__"==__name__:
    url ="http://re.qq.com/biznext/zkht.htm"
    gethref(url)
    print "All Is OK!"


上一篇:Codeforces Round #535 (Div. 3) [codeforces div3 难度测评]


下一篇:BeautifulSoup中解决乱码问题