download:畅销3年的Python分布式爬虫课程 Scrapy打造搜索引擎
未来是什么时代?是数据时代!数据分析服务、互联网金融,数据建模、自然语言处理、医疗病例分析……越来越多的工作会基于数据来做,而爬虫正是快速获取数据最重要的方式,相比其它语言,Python爬虫更简单、高效
适合人群
适合对爬虫感兴趣、想做大数据开发却找不到数据
又不知如何搭建一套稳定可靠的分布式爬虫的同学
想搭建搜索引擎但是不知道如何入手的同学
技术储备要求
具备一定的原生爬虫基础
了解前端页面,面向对象概念,计算机网络协议和数据库知识
代码如下:
package main
import (
"fmt"
"math/rand"
"time"
)
var (
Web = fakeSearch(“web”)
Image = fakeSearch(“image”)
Video = fakeSearch(“video”)
)
// Result is the text produced by a single search backend.
type Result string

// Search answers a query with a Result.
type Search func(query string) Result

// fakeSearch builds a simulated backend of the given kind. The returned Search
// sleeps for a random whole number of milliseconds in [0, 100) and then
// reports "<kind> result for <quoted query>" followed by a newline.
func fakeSearch(kind string) Search {
	search := func(query string) Result {
		// Simulate variable backend latency.
		delay := time.Duration(rand.Intn(100)) * time.Millisecond
		time.Sleep(delay)

		text := fmt.Sprintf("%s result for %q\n", kind, query)
		return Result(text)
	}
	return search
}
// Google queries the Web, Image and Video backends one after another and
// returns their results in that order. Total latency is the sum of the three
// individual searches.
func Google(query string) (results []Result) {
	for _, search := range []Search{Web, Image, Video} {
		results = append(results, search(query))
	}
	return results
}
func main() {
rand.Seed(time.Now().UnixNano())
start := time.Now()
results := Google(“golang”)
elapsed := time.Since(start)
fmt.Println(results)
fmt.Println(elapsed)
}
运行结果如下:
[web result for "golang"
image result for "golang"
video result for "golang"
]
153.365484ms
谷歌搜索2.0
同时运行网页、图像和视频搜索,并等待所有结果。没有锁,没有条件变量,没有回调。
代码如下,关注Google函数。
package main
import (
"fmt"
"math/rand"
"time"
)
var (
Web = fakeSearch(“web”)
Image = fakeSearch(“image”)
Video = fakeSearch(“video”)
)
type (
	// Result is the text produced by a single search backend.
	Result string

	// Search answers a query with a Result.
	Search func(query string) Result
)

// fakeSearch returns a stand-in backend for the given kind. Every call to the
// returned Search first sleeps for a random whole number of milliseconds below
// 100, then yields "<kind> result for <quoted query>" plus a trailing newline.
func fakeSearch(kind string) Search {
	return func(query string) Result {
		latency := time.Duration(rand.Intn(100)) * time.Millisecond
		time.Sleep(latency)
		return Result(fmt.Sprintf("%s result for %q\n", kind, query))
	}
}
// Google runs the Web, Image and Video searches concurrently and returns all
// three results in the order they arrive. It blocks until every backend has
// answered.
func Google(query string) (results []Result) {
	backends := []Search{Web, Image, Video}
	answers := make(chan Result)

	// One goroutine per backend; each delivers its single result on answers.
	for _, backend := range backends {
		go func(search Search) { answers <- search(query) }(backend)
	}

	// Exactly one send is pending per backend, so receive that many times.
	for range backends {
		results = append(results, <-answers)
	}
	return results
}
func main() {
rand.Seed(time.Now().UnixNano())
start := time.Now()
results := Google(“golang”)
elapsed := time.Since(start)
fmt.Println(results)
fmt.Println(elapsed)
}
谷歌搜索2.1 不要等待缓慢的服务器。没有锁,没有条件变量,没有回调。通过select实现超时;需要把time.After创建的超时通道定义在for循环之外,这样超时限制的是整次搜索,而不是每次接收都重新计时。
package main
import (
"fmt"
"math/rand"
"time"
)
var (
Web = fakeSearch(“web”)
Image = fakeSearch(“image”)
Video = fakeSearch(“video”)
)
// Result is the text produced by a single search backend.
type Result string

// Search answers a query with a Result.
type Search func(query string) Result

// fakeSearch makes a pretend backend for the given kind. The Search it returns
// pauses for a random whole number of milliseconds in [0, 100) on every call
// and then answers "<kind> result for <quoted query>" ending in a newline.
func fakeSearch(kind string) Search {
	return Search(func(query string) Result {
		pauseMillis := rand.Intn(100)
		time.Sleep(time.Duration(pauseMillis) * time.Millisecond)

		reply := fmt.Sprintf("%s result for %q\n", kind, query)
		return Result(reply)
	})
}
func Google(query string) (results []Result) {
c := make(chan Result)
go func() { c <- Web(query) } ()
go func() { c <- Image(query) } ()
go func() { c <- Video(query) } ()
timeout := time.After(80 * time.Millisecond)
for i := 0; i < 3; i++ {
select {
case result := <-c:
results = append(results, result)
case <-timeout:
fmt.Println(“timed out”)
return
}
}
return
}
func main() {
rand.Seed(time.Now().UnixNano())
start := time.Now()
results := Google(“golang”)
elapsed := time.Since(start)
fmt.Println(results)
fmt.Println(elapsed)
}
谷歌搜索3.0 内容从48页到51页。
使用多副本的搜索服务器来减少尾部延迟。同样没有锁,没有条件变量,没有回调。
问:我们如何避免因服务器响应缓慢而丢弃结果?
答:复制服务器。向多个副本发送请求,并使用最先返回的响应。
代码如下:
package main
import (
"fmt"
"math/rand"
"time"
)
var (
Web1 = fakeSearch(“web”)
Web2 = fakeSearch(“web”)
Image1 = fakeSearch(“image”)
Image2 = fakeSearch(“image”)
Video1 = fakeSearch(“video”)
Video2 = fakeSearch(“video”)
)
// Result is the text produced by a single search backend.
type Result string

// Search answers a query with a Result.
type Search func(query string) Result

// fakeSearch creates a simulated backend (or replica) of the given kind. Each
// call to the returned Search waits a random whole number of milliseconds in
// [0, 100) and then returns "<kind> result for <quoted query>" with a
// trailing newline.
func fakeSearch(kind string) Search {
	var replica Search = func(query string) Result {
		// Random wait models an unpredictable server response time.
		wait := time.Duration(rand.Intn(100)) * time.Millisecond
		time.Sleep(wait)

		return Result(fmt.Sprintf("%s result for %q\n", kind, query))
	}
	return replica
}
func Google(query string) (results []Result) {
c := make(chan Result)
go func() { c <- First(query, Web1, Web2) } ()
go func() { c <- First(query, Image1, Image2) } ()
go func() { c <- First(query, Video1, Video2) } ()
timeout := time.After(80 * time.Millisecond)
for i := 0; i < 3; i++ {
select {
case result := <-c:
results = append(results, result)
case <-timeout:
fmt.Println(“timed out”)
return
}
}
return
}
func First(query string, replicas …Search) Result {
c := make(chan Result)
searchReplica := func(i int) { c <- replicasi }
for i := range replicas {
go searchReplica(i)
}
return <-c
}
func main() {
rand.Seed(time.Now().UnixNano())
start := time.Now()
results := Google(“golang”)
elapsed := time.Since(start)
fmt.Println(results)
fmt.Println(elapsed)
}
执行结果如下:
[image result for "golang"
web result for "golang"
video result for "golang"
]
53.605273ms