goquery爬虫实践
/*获取URL范例*/ /* Find 查找获取当前匹配的每个元素的后代 Eq 选择第几个 Attr 获取对应的标签属性 AttrOr 获取对应的标签属性。这个可以设置第二个参数。获取的默认值 如果获取不到默认调用对应默认值 Each 遍历每一个元素 Text 获取当前对应的文本 Html 获取当前对象的标签 AddClass 添加 class 不过用来抓取有点鸡肋不知道为何要写这个 Children 返回所有子元素 Filter 过滤标签元素 Prev 获取上一个元素 Next 获取下一个元素 */ package main import ( "fmt" "log" "os" "regexp" "strconv" "github.com/PuerkitoBio/goquery" ) func getdata(ins int, ch chan int) { url := "" if ins == 1 { url = "https://colobu.com/categories/Go" } else { url = "https://colobu.com/categories/Go/page/" + strconv.Itoa(ins) + "/" } doc, err := goquery.NewDocument(url) if err != nil { log.Fatal(err) } // [译]利用 gopackage 进行包的捕获、注入和分析 doc.Find(".article-title").Each(func(i int, s *goquery.Selection) { a, _ := s.Attr("href") text := s.Text() a = "https://colobu.com" + a //htmls, _ := s.Html() fmt.Println("") fmt.Println("") fmt.Println(" 地址:" + a) fmt.Println(" 标题:" + text) /*text = strings.ReplaceAll(text, ":", " ") text = strings.ReplaceAll(text, "/", " ") text = strings.ReplaceAll(text, "\\", " ") text = strings.ReplaceAll(text, "?", " ") text = strings.ReplaceAll(text, "*", " ") */ reg := regexp.MustCompile(`:|\?|/|\*|<|>|"`) tilte := reg.ReplaceAllString(text, " ") docm, err := goquery.NewDocument(a) if err != nil { log.Fatal(err) } sstext := "" docm.Find(".article-entry").Each(func(ii int, ss *goquery.Selection) { sstext = ss.Text() }) //fmt.Println(" 正文:" + sstext) file, _ := os.OpenFile("./爬虫/第"+strconv.Itoa(ins)+"页 "+strconv.Itoa(i+1)+"篇 "+tilte+"页爬虫.txt", os.O_RDWR|os.O_TRUNC|os.O_CREATE, 0666) defer file.Close() file.Write([]byte(text + "\n正文:\n" + sstext + "\n\n\n")) fmt.Println(" ---------------------------------------------------------------------------- ") }) ch <- ins } func Doing(s, e int) { ch := make(chan int) for i := s; i <= e; i++ { go getdata(i, ch) } for i := s; i <= e; i++ { n := <-ch fmt.Printf("第%d页爬取完毕\n", n) } } func main() { var start, end int fmt.Println("输入起始页") fmt.Scan(&start) fmt.Println("输入终止页") fmt.Scan(&end) Doing(start, end) }
package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"strconv"
	"strings"
	"time"
)

// tiebaListURL is the forum list URL without the value of the trailing pn
// (offset) parameter; every list page holds 50 entries.
const tiebaListURL = "http://tieba.baidu.com/f?kw=%E5%88%AB%E5%85%8B&ie=utf-8&pn="

// httpClient is shared by all requests. The timeout keeps a stalled server from
// hanging the crawler forever (http.Get has no timeout at all).
var httpClient = &http.Client{Timeout: 30 * time.Second}

// HttpGet downloads url and returns the complete response body as a string.
//
// An error is returned when the request fails, when the server answers with a
// non-2xx status, or when reading the body fails part-way; in all of these
// cases res is empty so a truncated page is never mistaken for a full one.
// The requested URL and a completion message are printed to standard output.
func HttpGet(url string) (res string, err error) {
	fmt.Println(url)

	resp, err := httpClient.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return "", fmt.Errorf("GET %s: unexpected status %s", url, resp.Status)
	}

	// Stream into a builder: linear time, and every read error is surfaced.
	var body strings.Builder
	if _, err := io.Copy(&body, resp.Body); err != nil {
		return "", err
	}
	fmt.Println("读取完毕")
	return body.String(), nil
}

// savePage writes content to the file name, creating or truncating it, and
// reports any create, write or close error.
func savePage(name, content string) error {
	f, err := os.Create(name)
	if err != nil {
		return err
	}
	if _, err := f.WriteString(content); err != nil {
		f.Close()
		return err
	}
	return f.Close()
}

// working downloads the list pages start through end (inclusive) one after the
// other, echoes each page to standard output and stores it as
// "第<page>页面.html" in the working directory. A page that cannot be fetched
// or saved is reported and skipped; the remaining pages are still processed.
func working(start, end int) {
	fmt.Printf("正在爬取%d页面到%d页\n", start, end)
	for i := start; i <= end; i++ {
		// Page i starts at entry offset (i-1)*50.
		url := tiebaListURL + strconv.Itoa((i-1)*50)

		result, err := HttpGet(url)
		if err != nil {
			fmt.Println(err)
			continue
		}
		fmt.Println(result)

		if err := savePage("第"+strconv.Itoa(i)+"页面.html", result); err != nil {
			fmt.Println(err)
		}
	}
}

// main reads the first and last page number from standard input and runs the
// crawl. Unparsable input or an invalid range ends the program with a message.
func main() {
	var start, end int

	fmt.Println("请输入爬取的起始页(》=1):")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("读取起始页失败:", err)
		return
	}
	fmt.Println("请输入爬取的结束页(》=start):")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("读取结束页失败:", err)
		return
	}
	if start < 1 || end < start {
		fmt.Println("页码范围无效:", start, "-", end)
		return
	}
	working(start, end)
}
package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"strconv"
	"strings"
	"time"
)

const (
	// tiebaListURL is the forum list URL without the value of the trailing pn
	// (offset) parameter; every list page holds 50 entries.
	tiebaListURL = "http://tieba.baidu.com/f?kw=%E5%88%AB%E5%85%8B&ie=utf-8&pn="
	// maxConcurrentFetches caps how many pages are downloaded at the same time.
	maxConcurrentFetches = 8
)

// httpClient is shared by all goroutines (http.Client is safe for concurrent
// use). The timeout keeps a stalled server from blocking a worker forever.
var httpClient = &http.Client{Timeout: 30 * time.Second}

// HttpGet downloads url and returns the complete response body as a string.
//
// An error is returned when the request fails, when the server answers with a
// non-2xx status, or when reading the body fails part-way; in all of these
// cases res is empty so a truncated page is never mistaken for a full one.
// The requested URL is printed to standard output.
func HttpGet(url string) (res string, err error) {
	fmt.Println(url)

	resp, err := httpClient.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return "", fmt.Errorf("GET %s: unexpected status %s", url, resp.Status)
	}

	// Stream into a builder: linear time, and every read error is surfaced.
	var body strings.Builder
	if _, err := io.Copy(&body, resp.Body); err != nil {
		return "", err
	}
	return body.String(), nil
}

// savePage writes content to the file name, creating or truncating it, and
// reports any create, write or close error.
func savePage(name, content string) error {
	f, err := os.Create(name)
	if err != nil {
		return err
	}
	if _, err := f.WriteString(content); err != nil {
		f.Close()
		return err
	}
	return f.Close()
}

// getdata downloads list page i, stores it as "第<i>页面.html" in the working
// directory and then sends i on ch.
//
// i is sent on ch in every case, including failures, so the receiver in working
// never blocks forever. When the download fails the error is printed and no
// file is created.
func getdata(i int, ch chan int) {
	defer func() { ch <- i }()

	name := "第" + strconv.Itoa(i) + "页面.html"
	fmt.Println(name)

	// Page i starts at entry offset (i-1)*50.
	result, err := HttpGet(tiebaListURL + strconv.Itoa((i-1)*50))
	if err != nil {
		fmt.Println(err)
		return
	}
	if err := savePage(name, result); err != nil {
		fmt.Println(err)
	}
}

// working downloads the list pages s through e (inclusive) concurrently and
// prints a line for each page as it finishes. At most maxConcurrentFetches
// downloads run at any time.
func working(s, e int) {
	ch := make(chan int)
	slots := make(chan struct{}, maxConcurrentFetches)

	for i := s; i <= e; i++ {
		go func(page int) {
			slots <- struct{}{}
			defer func() { <-slots }()
			getdata(page, ch)
		}(i)
	}
	for i := s; i <= e; i++ {
		n := <-ch
		fmt.Printf("第%d页爬取完毕\n", n)
	}
}

// main reads the first and last page number from standard input and runs the
// crawl. Unparsable input or an invalid range ends the program with a message.
func main() {
	var start, end int

	fmt.Println("请输入爬取的起始页(》=1):")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("读取起始页失败:", err)
		return
	}
	fmt.Println("请输入爬取的结束页(》=start):")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("读取结束页失败:", err)
		return
	}
	if start < 1 || end < start {
		fmt.Println("页码范围无效:", start, "-", end)
		return
	}
	working(start, end)
}
文章题目:goquery爬虫实践
链接URL:http://scpingwu.com/article/ijehid.html