package main
import (
"net/http"
"fmt"
"github.com/PuerkitoBio/goquery"
"strconv"
)
// GetMovie fetches the movie detail page at url, parses it with goquery and
// prints the title, year, director, screenwriter, cast and genre to stdout.
//
// It panics if the HTTP request fails or the response body cannot be parsed
// as HTML. A non-200 response is reported on stdout and the page is skipped,
// because the selectors below would only yield empty fields on an error page.
func GetMovie(url string) {
	fmt.Println(url)

	resp, err := http.Get(url)
	if err != nil {
		panic(err)
	}
	// Always release the body; otherwise every call leaks a connection.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		fmt.Printf("err: unexpected status %d for %s\n", resp.StatusCode, url)
		return
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		panic(err)
	}

	// Title and year are both children of the page's <h1>.
	doc.Find("#content h1").Each(func(_ int, s *goquery.Selection) {
		fmt.Println("name:" + s.ChildrenFiltered(`[property="v:itemreviewed"]`).Text())
		fmt.Println("year:" + s.ChildrenFiltered(`.year`).Text())
	})

	// Selection.Text concatenates the text of every matched element, which is
	// what a manual += accumulation over Each would produce.
	//
	// The nth-child selectors are positional, so they depend on the order of
	// the entries inside #info.

	// Label: director.
	fmt.Println("导演:" + doc.Find("#info span:nth-child(1) span.attrs").Text())
	// Label: screenwriter.
	fmt.Println("编剧:" + doc.Find("#info span:nth-child(3) span.attrs").Text())
	// Label: starring.
	fmt.Println("主演:" + doc.Find("#info span.actor span.attrs").Text())
	// Label: genre.
	fmt.Println("类型:" + doc.Find("#info > span:nth-child(8)").Text())
}
// GetToplist fetches one page of the ranking list at url and returns the
// href of every movie link found on it, in document order.
//
// It panics if the HTTP request fails or the response body cannot be parsed
// as HTML. A non-200 response is reported on stdout and nil is returned.
func GetToplist(url string) []string {
	resp, err := http.Get(url)
	if err != nil {
		panic(err)
	}
	// Always release the body; otherwise every call leaks a connection.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		fmt.Printf("err: unexpected status %d for %s\n", resp.StatusCode, url)
		return nil
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		panic(err)
	}

	var urls []string
	doc.Find("#content div div.article ol li div div.info div.hd a").Each(func(_ int, s *goquery.Selection) {
		// Skip anchors without a usable href so callers never receive an
		// empty URL.
		if href, ok := s.Attr("href"); ok && href != "" {
			urls = append(urls, href)
		}
	})
	return urls
}
func main() {
url := "https://movie.douban.com/top250?start="
var urls []string
var newUrl string
fmt.Println("%v", urls)
for i := 0; i < 10; i++ {
start := i * 25
newUrl = url + strconv.Itoa(start)
urls = GetToplist(newUrl)
for _, url := range urls {
GetMovie(url)
}
}
}
// Document embeds *Selection and additionally carries a URL and a root HTML
// node.
// NOTE(review): this looks like an excerpt of goquery's own Document type
// quoted for illustration; the url and html packages it references are not
// imported in the visible import block — confirm before compiling.
type Document struct {
*Selection
// Url is a parsed URL; presumably the address the document was loaded
// from — verify.
Url *url.URL
// rootNode is unexported; presumably the root of the parsed HTML tree —
// verify.
rootNode *html.Node
}
// Selection groups a slice of HTML nodes with pointers to a Document and to
// another Selection.
// NOTE(review): this looks like an excerpt of goquery's own Selection type
// quoted for illustration; the html package is not imported in the visible
// import block — confirm before compiling.
type Selection struct {
// Nodes holds the HTML nodes that make up this selection.
Nodes []*html.Node
// document is unexported; presumably the Document these nodes belong
// to — verify.
document *Document
// prevSel is unexported; presumably the selection this one was derived
// from — verify.
prevSel *Selection
}
– Eq() – First() – Get() – Index…() – Last() – Slice()

2)扩大 Selection 集合(增加选择的节点)
– Add…() – AndSelf() – Union(), which is an alias for AddSelection()

3)过滤方法,减少节点集合
– End() – Filter…() – Has…() – Intersection(), which is an alias of FilterSelection() – Not…()

4)循环遍历选择的节点
– Each() – EachWithBreak() – Map()

5)修改文档
– After…() – Append…() – Before…() – Clone() – Empty() – Prepend…() – Remove…() – ReplaceWith…() – Unwrap() – Wrap…() – WrapAll…() – WrapInner…()

6)检测或获取节点属性值
– Attr(), RemoveAttr(), SetAttr() – AddClass(), HasClass(), RemoveClass(), ToggleClass() – Html() – Length() – Size(), which is an alias for Length() – Text()

7)查询或显示一个节点的身份
– Contains() – Is…()

8)在文档树之间来回跳转(常用的查找节点方法)
– Children…() – Contents() – Find…() – Next…() – Parent[s]…() – Prev…() – Siblings…()

2.3 Matcher 接口
// Matcher is an interface with three methods that operate on HTML nodes.
// NOTE(review): this looks like an excerpt of goquery's own Matcher interface
// quoted for illustration; the html package is not imported in the visible
// import block — confirm before compiling.
type Matcher interface {
// Match takes a single node and returns a bool; presumably whether the
// node satisfies the matcher — verify.
Match(*html.Node) bool
// MatchAll takes a single node and returns a slice of nodes; presumably
// every match found under that node — verify.
MatchAll(*html.Node) []*html.Node
// Filter takes a slice of nodes and returns a slice of nodes; presumably
// the subset that matches — verify.
Filter([]*html.Node) []*html.Node
}
本文为 @ 21CTO 创作并授权 21CTO 发布,未经许可,请勿转载。
内容授权事宜请您联系 webmaster@21cto.com或关注 21CTO 公众号。
该文观点仅代表作者本人,21CTO 平台仅提供信息存储空间服务。