Project 4: Concurrent Crawler (Beginner)
Fetching Web Page Data
Basic Web Request
```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
)

func fetchURL(url string) (string, error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(body), nil
}

func main() {
	url := "https://example.com"
	content, err := fetchURL(url)
	if err != nil {
		fmt.Println("Error fetching URL:", err)
		return
	}

	// Print a short preview; guard against pages shorter than 100 bytes,
	// which would otherwise make the slice expression panic.
	preview := content
	if len(preview) > 100 {
		preview = preview[:100]
	}
	fmt.Println("Fetched content:", preview, "...")

	// Save the content to a file
	err = os.WriteFile("example.html", []byte(content), 0644)
	if err != nil {
		fmt.Println("Error writing file:", err)
		return
	}
	fmt.Println("Content saved to example.html")
}
```
Concurrent Requests
Basic Concurrent Crawler
```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"sync"
)

// fetchURL downloads one page and reports the outcome on the results channel.
func fetchURL(url string, wg *sync.WaitGroup, results chan<- string) {
	defer wg.Done()
	resp, err := http.Get(url)
	if err != nil {
		results <- fmt.Sprintf("Error fetching %s: %v", url, err)
		return
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		results <- fmt.Sprintf("Error reading %s: %v", url, err)
		return
	}
	results <- fmt.Sprintf("Fetched %s: %d bytes", url, len(body))
}

func main() {
	urls := []string{
		"https://example.com",
		"https://golang.org",
		"https://google.com",
		"https://github.com",
		"https://stackoverflow.com",
	}

	var wg sync.WaitGroup
	// Buffer the channel so every goroutine can send its result
	// without blocking, even though main reads only after wg.Wait().
	results := make(chan string, len(urls))

	for _, url := range urls {
		wg.Add(1)
		go fetchURL(url, &wg, results)
	}

	wg.Wait()
	close(results)

	for result := range results {
		fmt.Println(result)
	}
}
```
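The buffered channel with capacity len(urls) is what makes it safe to call wg.Wait() before draining results: every goroutine can deposit its message without blocking. An alternative pattern closes the channel from a helper goroutine so main can print results as they arrive instead of waiting for all fetches to finish. A sketch of the tail of main rewritten that way (a variation, not a required change):

```go
	// Close results once all workers are done, from a separate
	// goroutine, so the range loop below can start immediately.
	go func() {
		wg.Wait()
		close(results)
	}()

	// Results are printed as each fetch completes.
	for result := range results {
		fmt.Println(result)
	}
```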
Saving the Data
Complete Code: Concurrent Crawler
```go
package main

import (
	"bufio"
	"encoding/csv"
	"flag"
	"fmt"
	"io"
	"net/http"
	"os"
	"strings"
	"sync"
)

// PageInfo records the outcome of fetching a single URL.
type PageInfo struct {
	URL    string
	Length int
	Error  string
}

func fetchURL(url string, results chan<- PageInfo) {
	resp, err := http.Get(url)
	if err != nil {
		results <- PageInfo{URL: url, Error: err.Error()}
		return
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		results <- PageInfo{URL: url, Error: err.Error()}
		return
	}
	results <- PageInfo{URL: url, Length: len(body)}
}

// saveResults writes all results to a CSV file with a header row.
func saveResults(results []PageInfo, filename string) error {
	file, err := os.Create(filename)
	if err != nil {
		return err
	}
	defer file.Close()

	writer := csv.NewWriter(file)
	defer writer.Flush()

	// Write the header row
	err = writer.Write([]string{"URL", "Length", "Error"})
	if err != nil {
		return err
	}

	// Write the data rows
	for _, result := range results {
		err = writer.Write([]string{
			result.URL,
			fmt.Sprintf("%d", result.Length),
			result.Error,
		})
		if err != nil {
			return err
		}
	}
	return nil
}

func main() {
	// Parse command-line flags
	urlsFile := flag.String("urls", "", "File containing URLs to crawl")
	output := flag.String("output", "results.csv", "Output CSV file")
	flag.Parse()

	var urls []string
	if *urlsFile != "" {
		// Read URLs from the file, one per line
		file, err := os.Open(*urlsFile)
		if err != nil {
			fmt.Println("Error opening URLs file:", err)
			return
		}
		defer file.Close()

		scanner := bufio.NewScanner(file)
		for scanner.Scan() {
			// Skip blank lines so they are not fetched as empty URLs
			if line := strings.TrimSpace(scanner.Text()); line != "" {
				urls = append(urls, line)
			}
		}
		if err := scanner.Err(); err != nil {
			fmt.Println("Error reading URLs file:", err)
			return
		}
	} else {
		// Default URLs
		urls = []string{
			"https://example.com",
			"https://golang.org",
			"https://google.com",
			"https://github.com",
			"https://stackoverflow.com",
		}
	}

	results := make(chan PageInfo, len(urls))
	var wg sync.WaitGroup

	for _, url := range urls {
		wg.Add(1)
		go func(u string) {
			defer wg.Done()
			fetchURL(u, results)
		}(url)
	}

	wg.Wait()
	close(results)

	var allResults []PageInfo
	for result := range results {
		allResults = append(allResults, result)
		if result.Error != "" {
			fmt.Printf("Error: %s - %s\n", result.URL, result.Error)
		} else {
			fmt.Printf("Fetched: %s - %d bytes\n", result.URL, result.Length)
		}
	}

	err := saveResults(allResults, *output)
	if err != nil {
		fmt.Println("Error saving results:", err)
		return
	}
	fmt.Printf("Results saved to %s\n", *output)
}
```
How to Run
Using the default URLs:
```bash
$ go run main.go
Fetched: https://example.com - 1256 bytes
Fetched: https://golang.org - 15000 bytes
Fetched: https://google.com - 50000 bytes
Fetched: https://github.com - 30000 bytes
Fetched: https://stackoverflow.com - 40000 bytes
Results saved to results.csv
```

(Since the fetches run concurrently, the order of the Fetched lines may vary between runs.)

Reading URLs from a file:
```bash
$ echo -e "https://example.com\nhttps://golang.org\nhttps://google.com" > urls.txt
$ go run main.go -urls=urls.txt -output=crawl_results.csv
```
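The CSV written by saveResults has the header row plus one row per URL; for a successful run it would look roughly like this (byte counts taken from the sample output above; the Error column is empty on success):

```csv
URL,Length,Error
https://example.com,1256,
https://golang.org,15000,
https://google.com,50000,
```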
Feature Extensions
- Parse HTML: use the golang.org/x/net/html package to parse HTML
- Extract links: extract links from each page to support deep crawling
- Concurrency control: use a channel to limit the number of concurrent requests (see the first sketch after this list)
- Timeout control: set a timeout on HTTP requests (also covered in the first sketch below)
- Proxy support: route requests through a proxy server
- Anti-scraping measures: add random delays, User-Agent headers, etc. (see the second sketch below)
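A minimal sketch of the concurrency-control and timeout items, assuming a cap of 3 concurrent requests and a 10-second timeout (both numbers are arbitrary choices for illustration): a buffered channel serves as a counting semaphore, and a shared http.Client carries the deadline.

```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"sync"
	"time"
)

// client replaces http.DefaultClient so every request has a deadline.
var client = &http.Client{Timeout: 10 * time.Second}

func main() {
	urls := []string{
		"https://example.com",
		"https://golang.org",
		"https://google.com",
	}

	// A buffered channel acts as a counting semaphore:
	// at most cap(sem) goroutines hold a slot at once.
	sem := make(chan struct{}, 3)
	var wg sync.WaitGroup

	for _, url := range urls {
		wg.Add(1)
		go func(u string) {
			defer wg.Done()
			sem <- struct{}{}        // acquire a slot
			defer func() { <-sem }() // release it when done

			resp, err := client.Get(u)
			if err != nil {
				fmt.Printf("Error fetching %s: %v\n", u, err)
				return
			}
			defer resp.Body.Close()

			body, err := io.ReadAll(resp.Body)
			if err != nil {
				fmt.Printf("Error reading %s: %v\n", u, err)
				return
			}
			fmt.Printf("Fetched %s: %d bytes\n", u, len(body))
		}(url)
	}
	wg.Wait()
}
```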

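And a sketch of the anti-scraping item: setting a custom User-Agent via http.NewRequest and sleeping a random interval before each request. The header string and delay range here are placeholder choices, not recommended values.

```go
package main

import (
	"fmt"
	"io"
	"math/rand"
	"net/http"
	"time"
)

// fetchPolitely adds a User-Agent header and a random delay of up to
// one second before each request. (Values are illustrative only.)
func fetchPolitely(client *http.Client, url string) (int, error) {
	// Random delay so requests are not fired in a tight burst.
	time.Sleep(time.Duration(rand.Intn(1000)) * time.Millisecond)

	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return 0, err
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; LearningCrawler/0.1)")

	resp, err := client.Do(req)
	if err != nil {
		return 0, err
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return 0, err
	}
	return len(body), nil
}

func main() {
	client := &http.Client{Timeout: 10 * time.Second}
	n, err := fetchPolitely(client, "https://example.com")
	if err != nil {
		fmt.Println("Error:", err)
		return
	}
	fmt.Printf("Fetched %d bytes\n", n)
}
```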