A Go Exercise: Writing a Web Crawler

A Tour of Go

I'm Koyo, and these days I write Go at 面白法人カヤック (KAYAC)! It's about time for the next batch of new hires to arrive, and I expect many of them will work through A Tour of Go while learning the language.
A Tour of Go is great material for learning Go, but some of its exercises are genuinely hard (laughs). (So I don't think new hires need to solve every one of them; I couldn't finish them all myself last year.)
The final exercise, Web Crawler, is especially tough. After a year of working with Go I understand it much better, and although I still stumbled a bit, I managed to write code I'm reasonably happy with, so here is an article about it!

Code

package main

import (
    "fmt"
    "sync"
)

type Fetcher interface {
    Fetch(url string) (body string, urls []string, err error)
}

type Crawler struct {
    cache *sync.Map
}

func NewCrawler() *Crawler {
    return &Crawler{
        cache: &sync.Map{},
    }
}

func (c *Crawler) Crawl(url string, depth int, fetcher Fetcher) {
    wg := &sync.WaitGroup{}
    c.crawl(url, depth, fetcher, wg)
    wg.Wait()

    return
}

func (c *Crawler) crawl(url string, depth int, fetcher Fetcher, wg *sync.WaitGroup) {
    if depth <= 0 {
        return
    }

    // LoadOrStore marks the URL as visited atomically, so two goroutines
    // can never fetch the same URL twice.
    if _, loaded := c.cache.LoadOrStore(url, struct{}{}); loaded {
        return
    }

    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }

    fmt.Printf("found: %s %q\n", url, body)
    wg.Add(len(urls))
    for _, u := range urls {
        go func(u string) {
            c.crawl(u, depth-1, fetcher, wg)
            wg.Done()
        }(u)
    }
}

func main() {
    NewCrawler().Crawl("https://golang.org/", 4, fetcher)
}

type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

var fetcher = fakeFetcher{
    "https://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "https://golang.org/pkg/",
            "https://golang.org/cmd/",
        },
    },
    "https://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "https://golang.org/",
            "https://golang.org/cmd/",
            "https://golang.org/pkg/fmt/",
            "https://golang.org/pkg/os/",
        },
    },
    "https://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
    "https://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
}

Output

found: https://golang.org/ "The Go Programming Language"
not found: https://golang.org/cmd/
found: https://golang.org/pkg/ "Packages"
found: https://golang.org/pkg/os/ "Package os"
found: https://golang.org/pkg/fmt/ "Package fmt"

Program exited.

A brief explanation

My approach defines a Crawler struct and implements the crawl logic as methods on it, using sync.WaitGroup to wait for all goroutines.
You could also solve the exercise with channels, or entirely inside a closure, but I find that a closure hurts readability and extensibility, so I prefer defining a struct. (A rough channel-based sketch follows below.)
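
For reference, here is a minimal sketch of what a channel-based version could look like. The function name ChannelCrawl and the task type are my own and not part of this post's solution; the sketch assumes the Fetcher interface and the fmt import from the listing above. A single goroutine owns the visited map, and fetch results come back over a channel, so no mutex or sync.Map is needed.

func ChannelCrawl(url string, depth int, fetcher Fetcher) {
    // task pairs a URL with its remaining depth.
    type task struct {
        url   string
        depth int
    }

    results := make(chan []task) // each fetch reports the tasks it discovered
    visited := map[string]bool{} // only this goroutine touches the map

    fetch := func(t task) {
        var next []task
        if t.depth > 0 {
            body, urls, err := fetcher.Fetch(t.url)
            if err != nil {
                fmt.Println(err)
            } else {
                fmt.Printf("found: %s %q\n", t.url, body)
                for _, u := range urls {
                    next = append(next, task{u, t.depth - 1})
                }
            }
        }
        results <- next
    }

    pending := 1
    visited[url] = true
    go fetch(task{url, depth})

    for pending > 0 {
        for _, t := range <-results {
            if !visited[t.url] {
                visited[t.url] = true
                pending++
                go fetch(t)
            }
        }
        pending--
    }
}

Calling ChannelCrawl("https://golang.org/", 4, fetcher) from main should print the same set of found lines, possibly in a different order.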

The key point of the implementation is sync.Map. A sync.Mutex combined with a map[string]bool would also work, but sync.Map gives you the same thing in a single value, which keeps the code simple.
I recommend it whenever you need a simple cache! (A sketch of the mutex-and-map alternative follows.)
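
As a point of comparison, here is a minimal sketch of the sync.Mutex plus map[string]bool variant. The SafeCache type and Visit method are hypothetical names I chose for illustration, not something from this post or the standard library.

package main

import (
    "fmt"
    "sync"
)

// SafeCache guards a plain map with a mutex; it needs two fields where
// sync.Map gets by with a single value.
type SafeCache struct {
    mu   sync.Mutex
    seen map[string]bool
}

// Visit records url and reports whether it had already been seen.
func (c *SafeCache) Visit(url string) (alreadySeen bool) {
    c.mu.Lock()
    defer c.mu.Unlock()
    if c.seen[url] {
        return true
    }
    c.seen[url] = true
    return false
}

func main() {
    cache := &SafeCache{seen: map[string]bool{}}
    fmt.Println(cache.Visit("https://golang.org/")) // false: first visit
    fmt.Println(cache.Visit("https://golang.org/")) // true: already cached
}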

Strictly speaking, since Fetch can return an error, errgroup.Group would probably be a better fit than sync.WaitGroup so that errors can be propagated to the caller, but please treat this as just one reference implementation. (A rough sketch of that variant follows.)
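
For reference, here is a rough sketch of what the errgroup-based Crawler could look like. This is my own adaptation, not the solution above: it assumes the Fetcher interface and the fakeFetcher fixture from the full listing, it needs the golang.org/x/sync/errgroup module (which the Tour's playground may not provide), and its behavior differs slightly because a failed Fetch now surfaces as an error returned from Crawl instead of being printed inside crawl.

import (
    "fmt"
    "sync"

    "golang.org/x/sync/errgroup"
)

// Crawler mirrors the struct above, but goroutines are managed by
// errgroup.Group so that errors from Fetch reach the caller.
type Crawler struct {
    cache *sync.Map
    group errgroup.Group
}

func NewCrawler() *Crawler {
    return &Crawler{cache: &sync.Map{}}
}

func (c *Crawler) Crawl(url string, depth int, fetcher Fetcher) error {
    if err := c.crawl(url, depth, fetcher); err != nil {
        return err
    }
    // Wait blocks until every goroutine started via group.Go has returned
    // and yields the first non-nil error, if any.
    return c.group.Wait()
}

func (c *Crawler) crawl(url string, depth int, fetcher Fetcher) error {
    if depth <= 0 {
        return nil
    }
    // LoadOrStore marks the URL as visited atomically.
    if _, loaded := c.cache.LoadOrStore(url, struct{}{}); loaded {
        return nil
    }

    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        return err
    }

    fmt.Printf("found: %s %q\n", url, body)
    for _, u := range urls {
        u := u // capture the loop variable for the goroutine
        c.group.Go(func() error {
            return c.crawl(u, depth-1, fetcher)
        })
    }
    return nil
}

main would then check the result, e.g. if err := NewCrawler().Crawl("https://golang.org/", 4, fetcher); err != nil { fmt.Println(err) }.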

Bonus: solving it with a closure

package main

import (
    "fmt"
    "sync"
)

type Fetcher interface {
    Fetch(url string) (body string, urls []string, err error)
}

func Crawl(url string, depth int, fetcher Fetcher) {
    var (
        cache = &sync.Map{}
        wg    = &sync.WaitGroup{}
        crawl func(url string, depth int)
    )

    crawl = func(url string, depth int) {
        if depth <= 0 {
            return
        }

        // LoadOrStore marks the URL as visited atomically, so two goroutines
        // can never fetch the same URL twice.
        if _, loaded := cache.LoadOrStore(url, struct{}{}); loaded {
            return
        }

        body, urls, err := fetcher.Fetch(url)
        if err != nil {
            fmt.Println(err)
            return
        }

        fmt.Printf("found: %s %q\n", url, body)
        wg.Add(len(urls))
        for _, u := range urls {
            go func(u string) {
                crawl(u, depth-1)
                wg.Done()
            }(u)
        }
    }

    crawl(url, depth)
    wg.Wait()
    return
}

func main() {
    Crawl("https://golang.org/", 4, fetcher)
}

type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

var fetcher = fakeFetcher{
    "https://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "https://golang.org/pkg/",
            "https://golang.org/cmd/",
        },
    },
    "https://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "https://golang.org/",
            "https://golang.org/cmd/",
            "https://golang.org/pkg/fmt/",
            "https://golang.org/pkg/os/",
        },
    },
    "https://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
    "https://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
}