Project 4: Concurrent Web Crawler (Beginner)

Fetching Web Page Data

Basic Web Request

go
package main

import (
    "fmt"
    "io"
    "net/http"
    "os"
)

func fetchURL(url string) (string, error) {
    resp, err := http.Get(url)
    if err != nil {
        return "", err
    }
    defer resp.Body.Close()
    
    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return "", err
    }
    
    return string(body), nil
}

func main() {
    url := "https://example.com"
    content, err := fetchURL(url)
    if err != nil {
        fmt.Println("Error fetching URL:", err)
        return
    }
    
    fmt.Println("Fetched content:", content[:100], "...")
    
    // 保存到文件
    err = os.WriteFile("example.html", []byte(content), 0644)
    if err != nil {
        fmt.Println("Error writing file:", err)
        return
    }
    fmt.Println("Content saved to example.html")
}
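
One thing this basic example does not do is check the HTTP status code, so a 404 error page would be saved just like a successful response. A minimal sketch of a stricter fetch function, assuming we simply want to treat any non-200 response as an error, could look like this:

go
package main

import (
    "fmt"
    "io"
    "net/http"
)

// fetchOK is a stricter variant of fetchURL that rejects
// non-200 responses before reading the body.
func fetchOK(url string) (string, error) {
    resp, err := http.Get(url)
    if err != nil {
        return "", err
    }
    defer resp.Body.Close()
    
    // Treat anything other than 200 OK as a failure.
    if resp.StatusCode != http.StatusOK {
        return "", fmt.Errorf("unexpected status %s for %s", resp.Status, url)
    }
    
    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return "", err
    }
    return string(body), nil
}

func main() {
    content, err := fetchOK("https://example.com")
    if err != nil {
        fmt.Println("Error:", err)
        return
    }
    fmt.Println("Fetched", len(content), "bytes")
}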

Concurrent Requests

Basic Concurrent Crawler

go
package main

import (
    "fmt"
    "io"
    "net/http"
    "sync"
)

func fetchURL(url string, wg *sync.WaitGroup, results chan<- string) {
    defer wg.Done()
    
    resp, err := http.Get(url)
    if err != nil {
        results <- fmt.Sprintf("Error fetching %s: %v", url, err)
        return
    }
    defer resp.Body.Close()
    
    body, err := io.ReadAll(resp.Body)
    if err != nil {
        results <- fmt.Sprintf("Error reading %s: %v", url, err)
        return
    }
    
    results <- fmt.Sprintf("Fetched %s: %d bytes", url, len(body))
}

func main() {
    urls := []string{
        "https://example.com",
        "https://golang.org",
        "https://google.com",
        "https://github.com",
        "https://stackoverflow.com",
    }
    
    var wg sync.WaitGroup
    results := make(chan string, len(urls))
    
    for _, url := range urls {
        wg.Add(1)
        go fetchURL(url, &wg, results)
    }
    
    wg.Wait()
    close(results)
    
    for result := range results {
        fmt.Println(result)
    }
}
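
This version relies on the results channel being buffered to len(urls): every goroutine can send without blocking, so it is safe to call wg.Wait() before anything is read. An alternative sketch (not part of the original, and using only two URLs for brevity) closes the channel from a separate goroutine, which lets main print results as they arrive and also works with an unbuffered channel:

go
package main

import (
    "fmt"
    "net/http"
    "sync"
)

func main() {
    urls := []string{"https://example.com", "https://golang.org"}
    
    var wg sync.WaitGroup
    results := make(chan string) // unbuffered is fine with this pattern
    
    for _, url := range urls {
        wg.Add(1)
        go func(u string) {
            defer wg.Done()
            resp, err := http.Get(u)
            if err != nil {
                results <- fmt.Sprintf("Error fetching %s: %v", u, err)
                return
            }
            resp.Body.Close()
            results <- fmt.Sprintf("Fetched %s: %s", u, resp.Status)
        }(url)
    }
    
    // Close the channel once every worker has finished,
    // so the range loop below knows when to stop.
    go func() {
        wg.Wait()
        close(results)
    }()
    
    // Results are printed as they arrive, not after all fetches complete.
    for result := range results {
        fmt.Println(result)
    }
}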

Saving Data

Complete Code: Concurrent Crawler

go
package main

import (
    "bufio"
    "encoding/csv"
    "flag"
    "fmt"
    "io"
    "net/http"
    "os"
    "sync"
)

type PageInfo struct {
    URL    string
    Length int
    Error  string
}

func fetchURL(url string, results chan<- PageInfo) {
    resp, err := http.Get(url)
    if err != nil {
        results <- PageInfo{URL: url, Error: err.Error()}
        return
    }
    defer resp.Body.Close()
    
    body, err := io.ReadAll(resp.Body)
    if err != nil {
        results <- PageInfo{URL: url, Error: err.Error()}
        return
    }
    
    results <- PageInfo{URL: url, Length: len(body)}
}

func saveResults(results []PageInfo, filename string) error {
    file, err := os.Create(filename)
    if err != nil {
        return err
    }
    defer file.Close()
    
    writer := csv.NewWriter(file)
    defer writer.Flush()
    
    // Write the CSV header
    err = writer.Write([]string{"URL", "Length", "Error"})
    if err != nil {
        return err
    }
    
    // Write one row per result
    for _, result := range results {
        err = writer.Write([]string{
            result.URL,
            fmt.Sprintf("%d", result.Length),
            result.Error,
        })
        if err != nil {
            return err
        }
    }
    
    return nil
}

func main() {
    // Parse command-line flags
    urlsFile := flag.String("urls", "", "File containing URLs to crawl")
    output := flag.String("output", "results.csv", "Output CSV file")
    flag.Parse()
    
    var urls []string
    
    if *urlsFile != "" {
        // Read URLs from the file
        file, err := os.Open(*urlsFile)
        if err != nil {
            fmt.Println("Error opening URLs file:", err)
            return
        }
        defer file.Close()
        
        scanner := bufio.NewScanner(file)
        for scanner.Scan() {
            urls = append(urls, scanner.Text())
        }
        
        if err := scanner.Err(); err != nil {
            fmt.Println("Error reading URLs file:", err)
            return
        }
    } else {
        // Default URLs
        urls = []string{
            "https://example.com",
            "https://golang.org",
            "https://google.com",
            "https://github.com",
            "https://stackoverflow.com",
        }
    }
    
    results := make(chan PageInfo, len(urls))
    var wg sync.WaitGroup
    
    for _, url := range urls {
        wg.Add(1)
        go func(u string) {
            defer wg.Done()
            fetchURL(u, results)
        }(url)
    }
    
    wg.Wait()
    close(results)
    
    var allResults []PageInfo
    for result := range results {
        allResults = append(allResults, result)
        if result.Error != "" {
            fmt.Printf("Error: %s - %s\n", result.URL, result.Error)
        } else {
            fmt.Printf("Fetched: %s - %d bytes\n", result.URL, result.Length)
        }
    }
    
    err := saveResults(allResults, *output)
    if err != nil {
        fmt.Println("Error saving results:", err)
        return
    }
    
    fmt.Printf("Results saved to %s\n", *output)
}

How to Run

  1. Using the default URLs

    bash
    $ go run main.go
    Fetched: https://example.com - 1256 bytes
    Fetched: https://golang.org - 15000 bytes
    Fetched: https://google.com - 50000 bytes
    Fetched: https://github.com - 30000 bytes
    Fetched: https://stackoverflow.com - 40000 bytes
    Results saved to results.csv
  2. Reading URLs from a file

    bash
    $ echo -e "https://example.com\nhttps://golang.org\nhttps://google.com" > urls.txt
    $ go run main.go -urls=urls.txt -output=crawl_results.csv

Possible Extensions

  1. Parse HTML: use the golang.org/x/net/html package to parse the page content
  2. Extract links: pull links out of each page to support deeper crawling (see the first sketch after this list)
  3. Concurrency limiting: use a channel to cap the number of simultaneous requests (see the second sketch after this list)
  4. Timeouts: set a timeout on every HTTP request
  5. Proxy support: crawl through a proxy server
  6. Anti-bot measures: add random delays, custom User-Agent headers, and so on
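
For items 1 and 2, a minimal sketch using golang.org/x/net/html might look like the following. The package is not in the standard library and needs to be fetched with go get golang.org/x/net/html; the starting URL is just an example:

go
package main

import (
    "fmt"
    "net/http"
    
    "golang.org/x/net/html"
)

// extractLinks walks the parsed HTML tree and collects the href
// attribute of every <a> element.
func extractLinks(n *html.Node, links []string) []string {
    if n.Type == html.ElementNode && n.Data == "a" {
        for _, attr := range n.Attr {
            if attr.Key == "href" {
                links = append(links, attr.Val)
            }
        }
    }
    for c := n.FirstChild; c != nil; c = c.NextSibling {
        links = extractLinks(c, links)
    }
    return links
}

func main() {
    resp, err := http.Get("https://example.com")
    if err != nil {
        fmt.Println("Error fetching page:", err)
        return
    }
    defer resp.Body.Close()
    
    doc, err := html.Parse(resp.Body)
    if err != nil {
        fmt.Println("Error parsing HTML:", err)
        return
    }
    
    for _, link := range extractLinks(doc, nil) {
        fmt.Println(link)
    }
}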
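
For items 3 and 4, a common pattern is a buffered channel used as a semaphore together with an http.Client that has a Timeout. A minimal sketch, with the limit of 2 concurrent requests and the 10-second timeout chosen arbitrarily, could look like this:

go
package main

import (
    "fmt"
    "net/http"
    "sync"
    "time"
)

func main() {
    urls := []string{
        "https://example.com",
        "https://golang.org",
        "https://github.com",
    }
    
    // A client with a timeout, so a slow server cannot block a worker forever.
    client := &http.Client{Timeout: 10 * time.Second}
    
    // A buffered channel used as a semaphore: at most 2 fetches run at once.
    sem := make(chan struct{}, 2)
    var wg sync.WaitGroup
    
    for _, url := range urls {
        wg.Add(1)
        go func(u string) {
            defer wg.Done()
            
            sem <- struct{}{}        // acquire a slot
            defer func() { <-sem }() // release it when done
            
            resp, err := client.Get(u)
            if err != nil {
                fmt.Printf("Error fetching %s: %v\n", u, err)
                return
            }
            resp.Body.Close()
            fmt.Printf("Fetched %s: %s\n", u, resp.Status)
        }(url)
    }
    
    wg.Wait()
}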
