编写一个Go的练习:写一个 Web 爬虫
一个Go语言之旅
我是Koyo,最近在面白法人カヤック写Go语言!又到了新一批新人入职的时候了。我想,新人们学习 Go 时,正好可以借这个机会做一做 A Tour of Go。
虽然 A Tour of Go 对于学习Go是非常好的题材,但是其中的练习确实很困难(笑)(所以我认为新人们不需要全部都做出来。去年我自己也做不完)
特别是最后一个练习:Web Crawler 真的很难。通过一年的工作,我对这个也有了更深的理解,虽然也遇到了困难,但是我还是能写出不错的代码,于是我写了一篇文章!
代码
package main
import (
"fmt"
"sync"
)
// Fetcher retrieves the body of a page and the list of URLs found on it.
type Fetcher interface {
	Fetch(url string) (body string, urls []string, err error)
}

// Crawler crawls a site graph, remembering which URLs it has already
// claimed so that each one is fetched at most once.
type Crawler struct {
	// cache is used as a set of visited URLs; values are always struct{}{}.
	cache *sync.Map
}

// NewCrawler returns a ready-to-use Crawler with an empty visited set.
func NewCrawler() *Crawler {
	return &Crawler{
		cache: &sync.Map{},
	}
}

// Crawl fetches url and, recursively, every page it links to, down to the
// given depth, using fetcher to retrieve pages. It blocks until all
// spawned goroutines have finished.
func (c *Crawler) Crawl(url string, depth int, fetcher Fetcher) {
	wg := &sync.WaitGroup{}
	c.crawl(url, depth, fetcher, wg)
	wg.Wait()
}

// crawl does the recursive work for Crawl.
//
// Fixes over the original version:
//   - it read the package-level fetcher variable instead of the Fetcher
//     passed to Crawl, silently ignoring the argument; the fetcher is now
//     threaded through explicitly;
//   - the separate Load/Store pair was not atomic, so two goroutines
//     racing on the same URL could both fetch it; LoadOrStore makes the
//     check-and-mark step atomic.
func (c *Crawler) crawl(url string, depth int, fetcher Fetcher, wg *sync.WaitGroup) {
	if depth <= 0 {
		return
	}
	if _, loaded := c.cache.LoadOrStore(url, struct{}{}); loaded {
		return // already claimed by another goroutine
	}
	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("found: %s %q\n", url, body)
	wg.Add(len(urls))
	for _, u := range urls {
		go func(u string) {
			defer wg.Done()
			c.crawl(u, depth-1, fetcher, wg)
		}(u)
	}
}
// main runs the crawler over the canned fetcher's site graph, starting
// at the Go homepage with a maximum depth of 4.
func main() {
	crawler := NewCrawler()
	crawler.Crawl("https://golang.org/", 4, fetcher)
}
// fakeFetcher is a canned Fetcher backed by an in-memory map from URL to
// its pre-recorded result.
type fakeFetcher map[string]*fakeResult

// fakeResult holds the canned body and outgoing links for one URL.
type fakeResult struct {
	body string
	urls []string
}

// Fetch looks url up in the map; unknown URLs yield a "not found" error.
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	res, ok := f[url]
	if !ok {
		return "", nil, fmt.Errorf("not found: %s", url)
	}
	return res.body, res.urls, nil
}
// fetcher is a canned, in-memory fakeFetcher populated with a tiny slice
// of the golang.org site graph. Note that "https://golang.org/cmd/" is
// linked to but has no entry of its own, so fetching it exercises the
// crawler's "not found" error path.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
执行结果
found: https://golang.org/ "The Go Programming Language"
not found: https://golang.org/cmd/
found: https://golang.org/pkg/ "Packages"
found: https://golang.org/pkg/os/ "Package os"
found: https://golang.org/pkg/fmt/ "Package fmt"
Program exited.
简单的解释
我试着使用 sync.WaitGroup 来定义 Crawler 方法。
我认为还有使用 channel 或通过闭包完成的方法。但使用闭包会降低可读性和扩展性,所以我更倾向于使用结构体定义。
实现中的关键是使用 sync.Map。虽然也可以用 sync.Mutex 加 map[string]bool 的组合,但 sync.Map 只需要一个变量就能完成同样的工作,更加省事。
建议在实现简单缓存时使用它!
严格来说,考虑到需要把错误返回给调用方,使用 errgroup.Group 代替 sync.WaitGroup 可能更好,但这里只是作为一个实现示例供参考。
附加内容:尝试使用闭包解决
package main
import (
"fmt"
"sync"
)
// Fetcher retrieves the body of a page and the list of URLs found on it.
type Fetcher interface {
	Fetch(url string) (body string, urls []string, err error)
}

// Crawl fetches url and, recursively, every page it links to, down to the
// given depth, using fetcher to retrieve pages. Every URL is fetched at
// most once. Crawl blocks until all spawned goroutines have finished.
//
// Fix over the original: the separate cache.Load / cache.Store pair was
// not atomic, so two goroutines racing on the same URL could both fetch
// it; LoadOrStore makes the check-and-mark step atomic.
func Crawl(url string, depth int, fetcher Fetcher) {
	var (
		cache = &sync.Map{}
		wg    = &sync.WaitGroup{}
		crawl func(url string, depth int)
	)
	crawl = func(url string, depth int) {
		if depth <= 0 {
			return
		}
		if _, loaded := cache.LoadOrStore(url, struct{}{}); loaded {
			return // already claimed by another goroutine
		}
		body, urls, err := fetcher.Fetch(url)
		if err != nil {
			fmt.Println(err)
			return
		}
		fmt.Printf("found: %s %q\n", url, body)
		wg.Add(len(urls))
		for _, u := range urls {
			go func(u string) {
				defer wg.Done()
				crawl(u, depth-1)
			}(u)
		}
	}
	crawl(url, depth)
	wg.Wait()
}
// main runs the closure-based crawler over the canned fetcher's site
// graph, starting at the Go homepage with a maximum depth of 4.
func main() {
	start := "https://golang.org/"
	Crawl(start, 4, fetcher)
}
// fakeFetcher is a canned Fetcher backed by an in-memory map from URL to
// its pre-recorded result.
type fakeFetcher map[string]*fakeResult

// fakeResult holds the canned body and outgoing links for one URL.
type fakeResult struct {
	body string
	urls []string
}

// Fetch looks url up in the map; unknown URLs yield a "not found" error.
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	res, ok := f[url]
	if !ok {
		return "", nil, fmt.Errorf("not found: %s", url)
	}
	return res.body, res.urls, nil
}
// fetcher is a canned, in-memory fakeFetcher populated with a tiny slice
// of the golang.org site graph. Note that "https://golang.org/cmd/" is
// linked to but has no entry of its own, so fetching it exercises the
// crawler's "not found" error path.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}