# Part one

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import time
import urllib, time, os, base64, json
import re, sys
from lxml import etree

from urllib.request import urlopen
from urllib import request


def getPage(base_url, timeout=None):
    """Fetch a page, reduce it to bare text, and append that text to the result file.

    The response body is decoded as UTF-8, passed through ``_strip_markup``
    and appended as one line to ``/home/output/crawler_result.csv`` via
    ``writefile``.

    Args:
        base_url: URL of the page to download.
        timeout: Optional socket timeout in seconds. ``None`` (the default)
            leaves the library's default timeout behaviour untouched.

    Returns:
        ``None`` on success, ``False`` if fetching, decoding or writing failed.
        Failures are reported on stdout instead of being raised.
    """
    try:
        req = request.Request(base_url)
        # Only forward a timeout when the caller asked for one, so the
        # default call behaves exactly like a plain urlopen(req).
        open_kwargs = {} if timeout is None else {"timeout": timeout}
        # The context manager closes the connection even if read()/decode() raises.
        with request.urlopen(req, **open_kwargs) as page:
            content = page.read().decode("utf-8")

        contentStr = _strip_markup(content)

        # writefile reports failure by returning False rather than raising;
        # surface that instead of silently dropping the result.
        if writefile("/home/output/crawler_result.csv", contentStr) is False:
            print("Failed to write result for %s." % base_url)
            return False

    except Exception:
        # Best-effort crawler: report the problem and signal failure to the caller.
        print("Failed to read from %s." % base_url)
        print(sys.exc_info())
        return False


# Patterns used by _strip_markup. Raw strings avoid invalid-escape warnings,
# and compiling once at import time avoids rebuilding them on every call.
_RE_SCRIPT = re.compile(r'<script[\S\s]+?</script>', re.I)  # <script>...</script> blocks
_RE_STYLE = re.compile(r'<style[\S\s]+?</style>', re.I)  # <style>...</style> blocks
_RE_TAG = re.compile(r'<[\S\s]+?>', re.I)  # any remaining <...> tag
_RE_WHITESPACE = re.compile(r'\s+')  # every run of whitespace


def _strip_markup(html):
    """Return ``html`` with script/style blocks, tags and all whitespace removed."""
    # Script and style blocks go first so their contents are not left behind
    # once the generic tag pattern removes the surrounding tags.
    text = _RE_SCRIPT.sub('', html)
    text = _RE_STYLE.sub('', text)
    text = _RE_TAG.sub('', text)
    # Removes ALL whitespace (spaces and tabs as well as newlines), so the
    # result is a single unbroken line.
    return _RE_WHITESPACE.sub('', text)


def writefile(filename, content):
    """Append ``content`` followed by a newline to ``filename``.

    Args:
        filename: Path of the file to append to; it is created if missing.
        content: Text to write (a ``str``).

    Returns:
        ``None`` on success, ``False`` if the file could not be opened or
        written (the error is deliberately not raised: callers treat this
        as a best-effort write).
    """
    try:
        # Explicit UTF-8 keeps non-ASCII text writable regardless of the
        # platform's locale encoding; the context manager closes the handle
        # even when write() raises.
        with open(filename, 'a', encoding='utf-8') as fp:
            fp.write(content + "\n")
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still propagate.
        return False


# Start-up timestamp; it prefixes the message logged if the crawl raises.
now = time.strftime('%Y-%m-%d %X', time.localtime())

# Address of the page to crawl.
url = 'http://117.73.9.229:9090/'

try:
    getPage(url)
except Exception as e:
    # Record the failure in Error.log, echo it to stdout, then pause one second.
    info = '%s\nError: %s' % (now, e)
    writefile('Error.log', info)
    print(info)
    time.sleep(1)
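# Previous post: a problem with O2OA build errors
#
#
# Next post: source-code walkthrough of the Comparable and Comparator interfaces - differences between, and usage of, the compareTo and compare methods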