python 模拟浏览器

想用python模拟浏览器访问web的方法测试些东西,有哪几种方法呢?

一类:单纯的访问web,不解析其js,css等。

1. urllib2

代码如下:
#-*- coding:utf-8 -*
import urllib2
 
def Furllib2(ip,port,url,timeout):
    """Fetch *url* through an HTTP proxy with urllib2 and print the response.

    Args:
        ip: Host of the HTTP proxy.
        port: Port of the HTTP proxy.
        url: Address to request.
        timeout: Timeout passed to ``urllib2.urlopen``.

    Returns:
        True when the request succeeded and the body was read, 0 otherwise.
    """
    proxydict = {'http': "http://%s:%s" % (ip, port)}
    print(proxydict)
    proxy_handler = urllib2.ProxyHandler(proxydict)
    opener = urllib2.build_opener(proxy_handler)
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    # install_opener makes this opener the default one used by
    # urllib2.urlopen, so the proxy setting outlives this call.
    urllib2.install_opener(opener)
    try:
        response = urllib2.urlopen(url, timeout=timeout)
        try:
            print(response.geturl())
            print(response.getcode())
            print(response.info())
            print(response.read())
        finally:
            # Release the connection even if reading the body fails.
            response.close()
        return True
    except Exception as exc:
        # Exception (not a bare except) lets Ctrl-C and SystemExit through.
        print('some errors occored' + '-' * 50)
        print(exc)
        return 0
 
def main():
    """Run the urllib2 proxy demo once and print its result."""
    proxyip = '14.18.16.69'
    proxyport = '80'
    url = 'http://www.cnblogs.com/'
    timeout = 4
    print(Furllib2(proxyip, proxyport, url, timeout))


if __name__ == "__main__":
    main()

 2. mechanize(与网站的自动化交互)

http://wwwsearch.sourceforge.net/mechanize/doc.html

代码如下:
def Fmechanize(url):
    """Open *url* with a cookie-aware mechanize opener and print URL and headers.

    The ``mechanize`` module must already be imported by the surrounding
    file; this block does not import it.

    Args:
        url: Address to request with GET.

    Returns:
        True when the request succeeded, 0 otherwise.
    """
    cookies = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
    try:
        # GET request; opener.open(url, data) would send a POST instead.
        r = opener.open(url)
        print(r.geturl())
        print(r.info())
        return True
    except Exception as exc:
        # Report the cause instead of failing silently.
        print(exc)
        return 0

 二类:模拟浏览器,使用firefox等的浏览器引擎,支持js,css等。

1. selenium 的firefox或者chrome等驱动,但是由于要打开一个浏览器,所以会比较慢(浏览器驱动可以到selenium官网上下载,也可以到firefox插件处搜索)

代码如下:
def Fselenium_firefox(ip,port,url,timeout):
    """Load *url* in Firefox through an HTTP proxy and print its cookies.

    Relies on ``webdriver`` (selenium), ``time`` and ``traceback`` being
    imported by the surrounding file.

    Args:
        ip: Host of the HTTP proxy.
        port: Port of the HTTP proxy; converted to int before use.
        url: Page to open.
        timeout: Passed to ``webdriver.Firefox`` as its ``timeout`` argument.

    Returns:
        1 when the page was loaded and the cookies were read, 0 otherwise.
    """
    try:
        profile = webdriver.FirefoxProfile()
        profile.set_preference('network.proxy.type', 1)
        profile.set_preference('network.proxy.http', ip)
        # Firefox keeps the proxy port as an integer preference; a string
        # such as '80' would not be applied, so normalise it here.
        profile.set_preference('network.proxy.http_port', int(port))
        profile.update_preferences()
        driver = webdriver.Firefox(profile, timeout=timeout)
    except Exception:
        # print_exc() writes the traceback itself and returns None, so its
        # result must not be printed.
        traceback.print_exc()
        return 0

    try:
        driver.get(url)
        # Fixed pause before reading cookies -- presumably to let page
        # scripts finish; TODO confirm the required delay.
        time.sleep(5)
        cookies = driver.get_cookies()
        print(cookies)
        return 1
    except Exception:
        traceback.print_exc()
        return 0
    finally:
        # Always close the browser, on success and on failure.
        driver.quit()

2. selenium:headless test,使用 selenium + phantomjs 驱动,无需打开浏览器,但是支持js的模拟浏览器动作,也就是说和你手工打开是没有区别的。

http://selenium.googlecode.com/git/docs/api/py/api.html

代码如下:
def Fselenium_phantomjs(ip,port,url,timeout):
    """Load *url* in headless PhantomJS through an HTTP proxy.

    Relies on ``webdriver`` (selenium), ``time`` and ``traceback`` being
    imported by the surrounding file.

    Args:
        ip: Host of the HTTP proxy.
        port: Port of the HTTP proxy.
        url: Page to open.
        timeout: Page-load timeout passed to ``set_page_load_timeout``.

    Returns:
        True when the driver could report its current URL, 0 otherwise.
        A failed or timed-out page load is printed but does not by itself
        make the function return 0.
    """
    service_args = [
        '--proxy=%s:%s' % (ip, port),
        '--proxy-type=http',
    ]
    print(service_args)

    try:
        driver = webdriver.PhantomJS(service_args=service_args)
    except Exception:
        # Without a driver there is nothing to query or to clean up.
        traceback.print_exc()
        return 0

    try:
        try:
            driver.set_page_load_timeout(timeout)
            driver.get(url)
            time.sleep(4)
        except Exception:
            # Load problems are reported; the URL check below still runs.
            traceback.print_exc()

        try:
            print(driver.current_url)
            return True
        except Exception:
            traceback.print_exc()
            return 0
    finally:
        # Shut the PhantomJS process down so it does not leak.
        driver.quit()

3. qt,网上抢来的代码

http://qt-project.org/wiki/PySide#PySide.QtWebKit.PySide.QtWebKit.QWebView.url

代码如下:
from PyQt4 import QtCore, QtGui, QtWebKit, QtNetwork
 
class cookieJar(QtNetwork.QNetworkCookieJar):
    """Cookie jar that is seeded from a value stored in the parent's settings.

    On construction the value stored under ``cookiesKey`` in
    ``parent.settings`` is read; if it is truthy it is parsed with
    ``QNetworkCookie.parseCookies`` and installed as the jar's cookies.
    Nothing in this class writes cookies back to the settings: the
    persistence hook below is commented out.
    """

    def __init__(self, cookiesKey, parent=None):
        """Create the jar and load any cookies stored under *cookiesKey*.

        Args:
            cookiesKey: Settings key under which cookies are stored.
            parent: Object exposing a ``settings`` attribute. Although the
                default is None, ``parent.settings`` is read
                unconditionally, so a real parent is required.
        """
        super(cookieJar, self).__init__(parent)

        self.mainWindow = parent
        self.cookiesKey = cookiesKey
        cookiesValue = self.mainWindow.settings.value(self.cookiesKey)

        if cookiesValue:
            cookiesList = QtNetwork.QNetworkCookie.parseCookies(cookiesValue)
            self.setAllCookies(cookiesList)

    # Disabled persistence hook, kept for reference:
    #
    # def setCookiesFromUrl(self, cookieList, url):
    #     cookiesValue = self.mainWindow.settings.value(self.cookiesKey)
    #     cookiesArray = cookiesValue if cookiesValue else QtCore.QByteArray()
    #
    #     for cookie in cookieList:
    #         cookiesArray.append(cookie.toRawForm() + "\n")
    #
    #     self.mainWindow.settings.setValue(self.cookiesKey, cookiesArray)
    #
    #     return super(cookieJar, self).setCookiesFromUrl(cookieList, url)

    def deleteCookie(self, cookieList):
        """Placeholder: reads a settings value and deletes nothing.

        NOTE(review): *cookieList* is unused and ``settings.value`` is
        called with an empty list rather than a key string -- the intended
        behaviour needs to be confirmed before this method is relied on.
        """
        cookie = []
        self.mainWindow.settings.value(cookie)
class webView(QtWebKit.QWebView):
    """Web view whose page uses a ``cookieJar`` built for the given key."""

    def __init__(self, cookiesKey, url, parent=None):
        """Create the view and attach a cookie jar to its network manager.

        Args:
            cookiesKey: Key handed to the ``cookieJar`` constructor.
            url: Accepted but not used here; the caller loads the page.
            parent: Parent widget, also handed to the cookie jar.
        """
        super(webView, self).__init__(parent)

        jar = cookieJar(cookiesKey, parent)
        self.cookieJar = jar

        network_manager = self.page().networkAccessManager()
        network_manager.setCookieJar(jar)
 
class myWindow(QtGui.QMainWindow):
    """Main window with a URL field, an "Add Tab" action and tabbed web views."""

    def __init__(self, parent=None):
        """Build the tab widget, the toolbar and the settings object."""
        super(myWindow, self).__init__(parent)

        # Key under which each tab's cookie jar looks up stored cookies.
        self.cookiesKey = "cookies"

        self.centralwidget = QtGui.QWidget(self)

        self.tabWidget = QtGui.QTabWidget(self.centralwidget)
        self.tabWidget.setTabsClosable(True)

        self.verticalLayout = QtGui.QVBoxLayout(self.centralwidget)
        self.verticalLayout.addWidget(self.tabWidget)

        self.actionTabAdd = QtGui.QAction(self)
        self.actionTabAdd.setText("Add Tab")
        self.actionTabAdd.triggered.connect(self.on_actionTabAdd_triggered)

        self.lineEdit = QtGui.QLineEdit(self)
        self.lineEdit.setText("http://www.example.com")

        self.toolBar = QtGui.QToolBar(self)
        self.toolBar.addAction(self.actionTabAdd)
        self.toolBar.addWidget(self.lineEdit)

        self.addToolBar(QtCore.Qt.ToolBarArea(QtCore.Qt.TopToolBarArea), self.toolBar)
        self.setCentralWidget(self.tabWidget)

        # Read by cookieJar through its parent when a tab is created.
        self.settings = QtCore.QSettings()

    @QtCore.pyqtSlot()
    def on_actionShowCookies_triggered(self):
        """Print the raw form of every cookie held by the current tab."""
        # Named `view` so the local does not shadow the webView class.
        view = self.tabWidget.currentWidget()
        listCookies = view.page().networkAccessManager().cookieJar().allCookies()

        for cookie in listCookies:
            print(cookie.toRawForm())

    @QtCore.pyqtSlot()
    def on_actionTabAdd_triggered(self):
        """Open a tab for the URL in the line edit, or a blank page if empty."""
        url = self.lineEdit.text()
        self.addNewTab(url if url else 'about:blank')

    def addNewTab(self, url):
        """Create a web view for *url*, add it as a tab and select it.

        Args:
            url: Address to load in the new tab.
        """
        tabName = u"Tab {0}".format(str(self.tabWidget.count()))

        view = webView(self.cookiesKey, url, self)
        view.loadFinished.connect(self.on_tabWidget_loadFinished)
        view.load(QtCore.QUrl(url))

        tabIndex = self.tabWidget.addTab(view, tabName)

        self.tabWidget.setCurrentIndex(tabIndex)

    @QtCore.pyqtSlot()
    def on_tabWidget_loadFinished(self):
        """Slot connected to each tab's ``loadFinished`` signal."""
        # NOTE(review): the stored value is read but never used; this looks
        # like an unfinished hook -- confirm what should happen here.
        cookies2 = self.settings.value(self.cookiesKey)
     
     
if __name__ == "__main__":
    # Script entry point: show the tabbed browser window and run the Qt
    # event loop until the application exits.
    import sys

    app = QtGui.QApplication(sys.argv)
    app.setApplicationName('myWindow')

    main = myWindow()
    main.resize(666, 333)
    main.show()

    sys.exit(app.exec_())

 

4. qt-headless

http://qt-project.org/wiki/PySide#PySide.QtWebKit.PySide.QtWebKit.QWebView.url

代码如下:
import sys 
from PyQt4.QtGui import * 
from PyQt4.QtCore import * 
from PyQt4.QtWebKit import * 
   
class Render(QWebPage):
    """Load a URL in a QWebPage and block until loading has finished.

    After construction returns, ``self.frame`` holds the page's main frame.
    """

    def __init__(self, url):
        """Start the application, load *url* and run the event loop.

        Args:
            url: Address to load into the page's main frame.
        """
        # The application object is created before the page is initialised.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        # Blocks until _loadFinished calls quit() on the application.
        self.app.exec_()

    def _loadFinished(self, result):
        """Store the main frame and stop the event loop.

        Args:
            result: Value delivered by the ``loadFinished`` signal; it is
                not inspected, so the frame is stored whatever its value.
        """
        self.frame = self.mainFrame()
        self.app.quit()
   
# Render the page headlessly and print the HTML of its main frame.
url = 'http://webscraping.com'
r = Render(url)
html = r.frame.toHtml()
print(html)

 5. splinter :打开浏览器,模拟操作,python的

http://splinter.cobrateam.info/docs/tutorial.html

代码如下:
>>> from splinter import Browser
>>> browser = Browser()
>>> browser.visit(url)

 

 

 

具体用哪个要看你有什么具体的需求了

python 模拟浏览器,布布扣,bubuko.com

python 模拟浏览器

上一篇:C++实现文件关联


下一篇:eclipse+MinGW(C++11)