晚上写了一个代理池,就是在一个代理网站上爬取代理ip和端口以及测试是否可用。接下来可能考虑扩展成一个比较大的 golang实现的代理池。
简易版代码:
package main import (
"os"
"fmt"
log "github.com/Sirupsen/logrus"
"io/ioutil"
"strings"
) type New struct {
Prefix string
NewId string
Title string
Time string
Content string
Subject string
} type Subject struct {
Name string
Url string
} func CreateDir(PathName string) error {
err := os.Mkdir(PathName, 0777)
if err != nil && !os.IsExist(err) {
return err
}
return nil
} func AppendFile(SavePath string, FileName string, buf string) {
out, err := os.OpenFile(SavePath+FileName, os.O_WRONLY, 0644)
defer out.Close()
if err != nil {
log.Errorln(err.Error())
return
}
offset, err := out.Seek(0, os.SEEK_END)
if err != nil {
log.Errorln(err.Error())
return
}
_, err = out.WriteAt([]byte(buf), offset)
if err != nil {
log.Errorln(err.Error())
return
}
log.Warnln("Save file finished. Locate in ", SavePath + FileName)
} func PathExists(path string) bool {
_, err := os.Stat(path)
if err == nil {
return true
}
if os.IsNotExist(err) {
return false
}
return false
} func SaveFile(SavePath string, FileName string, buf string) {
out, err := os.Create(SavePath + FileName)
defer out.Close()
fmt.Fprintf(out, "%s", buf)
if err != nil {
log.Errorln(err.Error())
return
}
log.Warnln("Save file finished. Locate in ", SavePath + FileName)
} func ReadAll(path string) ([]byte, error) {
f, err := os.Open(path)
if err != nil {
return nil, err
}
defer f.Close()
return ioutil.ReadAll(f)
} func ReadFile(path string) []string {
var fp interface{}
fp, err := ReadAll(path)
if err != nil {
log.Errorln(err.Error())
return nil
}
fp = string(fp.([]byte))
return strings.Split(fp.(string), "\n")
}
package main import (
log "github.com/Sirupsen/logrus"
"math/rand"
"net/http"
"net/url"
"time"
) var userAgent = [...]string {
"Mozilla/5.0 (compatible, MSIE 10.0, Windows NT, DigExt)",
"Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, 360SE)",
"Mozilla/4.0 (compatible, MSIE 8.0, Windows NT 6.0, Trident/4.0)",
"Mozilla/5.0 (compatible, MSIE 9.0, Windows NT 6.1, Trident/5.0,",
"Opera/9.80 (Windows NT 6.1, U, en) Presto/2.8.131 Version/11.11",
"Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, TencentTraveler 4.0)",
"Mozilla/5.0 (Windows, U, Windows NT 6.1, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Macintosh, Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh, U, Intel Mac OS X 10_6_8, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Linux, U, Android 3.0, en-us, Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
"Mozilla/5.0 (iPad, U, CPU OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, Trident/4.0, SE 2.X MetaSr 1.0, SE 2.X MetaSr 1.0, .NET CLR 2.0.50727, SE 2.X MetaSr 1.0)",
"Mozilla/5.0 (iPhone, U, CPU iPhone OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"MQQBrowser/26 Mozilla/5.0 (Linux, U, Android 2.3.7, zh-cn, MB200 Build/GRJ22, CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
} func GetRandomUserAgent() string {
var r = rand.New(rand.NewSource(time.Now().UnixNano()))
return userAgent[r.Intn(len(userAgent))]
} func GetFakeHeader(request *http.Request) {
request.Header.Set("User-Agent", GetRandomUserAgent())
request.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
request.Header.Set("Connection", "keep-alive")
//request.Header.Set("Accept-Encoding", "gzip, deflate")
} func GetByProxy(proxyAddr string, Url string) (*http.Response, error) {
timeout := time.Duration(10 * time.Second)
request, err := http.NewRequest(http.MethodGet, Url, nil)
GetFakeHeader(request)
if err != nil {
return nil, err
}
proxy, err := url.Parse(proxyAddr)
if err != nil {
log.Errorln(err.Error())
}
client := &http.Client{
Transport: &http.Transport{
Proxy: http.ProxyURL(proxy),
},
Timeout: timeout,
}
return client.Do(request)
} func GetByDirectory(Url string) (*http.Response, error) {
timeout := time.Duration(10 * time.Second)
request, err := http.NewRequest(http.MethodGet, Url, nil)
GetFakeHeader(request)
if err != nil {
return nil, err
}
client := http.Client{
Timeout: timeout,
}
return client.Do(request)
}
package main import (
log "github.com/Sirupsen/logrus"
"strconv"
"regexp"
"github.com/opesun/goquery"
"time"
) const (
SAVE_PATH = "kproxy.orz"
PROXY_URL = "http://www.kuaidaili.com/free/inha/"
)
var (
IP_REGEXP = regexp.MustCompile(`[\d]+\.[\d]+\.[\d]+\.[\d]+\n\s+[\d]+`)
IP_DETAIL_REGEXP = regexp.MustCompile(`[\d]+\.[\d]+\.[\d]+\.[\d]+`)
INT_REGEXP = regexp.MustCompile(`\s[\d]+`)
) func UrlGetter(num int) string {
return PROXY_URL + strconv.Itoa(num)
} func GetProxy(Url string) {
nod, err := goquery.ParseUrl(Url)
if err != nil {
log.Errorln(err.Error())
return
}
ret := nod.Text()
ips := IP_REGEXP.FindAll([]byte(ret), -1)
var port []string = make([]string, len(ips))
var str string = ""
for i := 0; i < len(ips); i++ {
port[i] = string(INT_REGEXP.FindAll(ips[i], -1)[0])[1:]
ips[i] = IP_DETAIL_REGEXP.FindAll(ips[i], -1)[0]
str += string(ips[i])+":"+port[i]+"\n"
}
AppendFile("./", SAVE_PATH, str)
} func main() {
log.Infoln("Start getting proxy ...")
SaveFile("./", SAVE_PATH, "")
for i := 1; i <= 500; i++ {
log.Println(UrlGetter(i))
GetProxy(UrlGetter(i))
time.Sleep(time.Second*5)
}
}
这里有个python来测试:
import urllib
import urllib2
import os
import socket AIM_URL = ''
PROXY_PATH = './kproxy.orz' class MyException(Exception):
pass def read_file(path):
if not os.path.exists(path):
print 'path : \''+ path + '\' not find.'
return []
content = ''
try:
with open(path, 'r') as fp:
content += reduce(lambda x,y:x+y, fp)
finally:
fp.close()
return content.split('\n') socket.setdefaulttimeout(5)
proxies = read_file(PROXY_PATH)
print len(proxies)
for pp in proxies:
try:
print 'http://'+pp
inforMation = urllib.urlopen(AIM_URL, proxies={'http': 'http://'+pp})
except urllib2.URLError, e:
if isinstance(e.reason, socket.timeout):
pass
except:
pass
finally:
pass