# coding: utf-8
import contextlib
import os
import sys
import urllib2

from BeautifulSoup import BeautifulSoup  # For processing HTML
from bs4 import BeautifulSoup  # rebinds the name; this import is the one in effect
class BookSave():
    """Download a set of linked HTML pages and repoint their css/js links.

    Attributes:
        dir: local directory in which the HTML files are saved.
        url: base URL of the index page; page paths are appended to it.
        static_url: prefix prepended to ``<link href>`` and ``<script src>``
            values (the parent of the js/css directories).
        distinguish: parent tag names used to tell apart links that share
            the same tag name; links under a matching parent are skipped.
        dis_key: attribute of a matched tag that holds the page path.
        key1: name of the tag to search for.
        key2: attribute name the tag must have.
        key3: value that attribute must have.

    Output uses the single-argument ``print(...)`` form so the text printed
    is the same whether ``print`` is a statement or a function.
    """

    def __init__(self, dir, url, static_url, distinguish, dis_key, key1, key2, key3):
        self.dir = dir
        self.url = url
        self.static_url = static_url
        self.distinguish = distinguish
        self.dis_key = dis_key
        self.key1 = key1
        self.key2 = key2
        self.key3 = key3

    def AddUrl(self):
        """Rewrite the css and js references of every file in ``self.dir``."""
        if self.dir == '':
            return
        for name in os.listdir(self.dir):
            # listdir() yields bare names, so they must be joined with
            # self.dir before testing them; otherwise the check is made
            # relative to the current working directory.
            if not os.path.isfile(os.path.join(self.dir, name)):
                continue
            self.JieXiCsss(name)
            self.JieXiJs(name)

    def JieXiCsss(self, file):
        """Prefix the ``href`` of each ``<link>`` in the head of *file*.

        *file* is a name relative to ``self.dir``; the file is rewritten
        in place.
        """
        print(os.path.join(self.dir, file))
        self._prefix_head_attr(file, 'link', 'href')

    def JieXiJs(self, file):
        """Prefix the ``src`` of each ``<script>`` in the head of *file*.

        *file* is a name relative to ``self.dir``; the file is rewritten
        in place.
        """
        self._prefix_head_attr(file, 'script', 'src')

    def _prefix_head_attr(self, file, tag_name, attr):
        """Prepend ``self.static_url`` to *attr* of every *tag_name* in <head>."""
        filePath = os.path.join(self.dir, file)
        with open(filePath, 'rb') as fp:
            soup = BeautifulSoup(fp)
        head = soup.head
        if head is None:
            # Nothing to rewrite (not an HTML page, or no <head> element).
            print('%s %s' % ([], filePath))
            return
        tags = head.findAll(tag_name)
        if not tags:
            print('%s %s' % (tags, filePath))
        for item in tags:
            try:
                value = item[attr]
            except KeyError:
                # e.g. an inline <script> without a src attribute
                continue
            # Leave absolute references alone so that running the rewrite
            # twice on the same file does not stack the prefix.
            if value.startswith(('http://', 'https://', '//')):
                continue
            item[attr] = self.static_url + value
            print(item[attr])
        # The modified document goes back to the local file (SaveFile);
        # SaveHtml would try to download from a URL instead.
        self.SaveFile(soup, filePath)

    def SaveFile(self, soup, file):
        """Write ``str(soup)`` to the path *file*, replacing its content."""
        html = str(soup)
        if not isinstance(html, bytes):
            # The file is opened in binary mode, which needs bytes.
            html = html.encode('utf-8')
        with open(file, 'wb') as code:
            code.write(html)

    def IsNullArr(self, Arr):
        """Return *Arr* unless it equals ``[]``; then report it and return None."""
        if Arr != []:
            return Arr
        print('array is null')
        return None

    def DownLoadHtml(self, arr):
        """Download the page referenced by each tag in *arr* into ``self.dir``.

        A tag is skipped when its parent's tag name is a substring of any
        element of ``self.distinguish``. The page path is read from the
        tag's ``self.dis_key`` attribute and appended to ``self.url``.
        """
        tags = self.IsNullArr(arr)
        if not tags:
            return
        for item in tags:
            liName = item.parent.name
            if any(liName in s for s in self.distinguish):
                continue
            htmlUrl = self.url + item[self.dis_key]
            print(htmlUrl)
            fileName = os.path.join(self.dir, item[self.dis_key])
            print('saving:' + htmlUrl)
            self.SaveHtml(fileName, htmlUrl)

    def SaveHtml(self, fileName, htmlUrl):
        """Fetch *htmlUrl* and write the raw response body to *fileName*."""
        # closing() guarantees the connection is released even if read() fails.
        with contextlib.closing(urllib2.urlopen(htmlUrl)) as response:
            html = response.read()
        parent = os.path.dirname(fileName)
        if parent and not os.path.exists(parent):
            # The target may sit in a sub-directory that does not exist yet.
            os.makedirs(parent)
        with open(fileName, "wb") as code:
            code.write(html)

    def GetSearchResult(self):
        """Return the tags at ``self.url`` matching key1 / {key2: key3}."""
        with contextlib.closing(urllib2.urlopen(self.url)) as doc:
            soup = BeautifulSoup(doc)
        return soup.findAll(self.key1, {self.key2: self.key3})

    def SplitString(self, source, sep):
        """Strip surrounding whitespace from *source* and split it on *sep*."""
        return source.strip().split(sep)

    def CreateDir(self):
        """Create ``self.dir`` (including parents) if it does not exist."""
        if self.dir and not os.path.exists(self.dir):
            os.makedirs(self.dir)
if __name__ == '__main__':
    # Mirror the Python 2 library reference into a local directory, then
    # repoint the css/js links of the saved pages at the online copies.
    urls = 'http://docs.python.org/2/library/'
    static_url = 'http://docs.python.org/2/'
    dirs = 'E:/demo/PythonLib1/'
    bs = BookSave(dirs, urls, static_url, 'p', 'href', 'a', 'class', 'reference internal')
    bs.CreateDir()

    # Save the index page itself first.
    fileName = os.path.join(dirs, 'index.html')
    htmlUrl = urls + 'index.html'
    bs.SaveHtml(fileName, htmlUrl)

    # Then every page the index links to; skip the step when the search
    # found nothing so the link rewrite below still runs.
    tags = bs.GetSearchResult()
    if tags:
        bs.DownLoadHtml(tags)
    bs.AddUrl()