-
Notifications
You must be signed in to change notification settings - Fork 17
/
page.go
114 lines (96 loc) · 2.21 KB
/
page.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
package ant
import (
"fmt"
"io"
"io/ioutil"
"net/http"
"net/url"
"sync"
"github.com/yields/ant/internal/scan"
"github.com/yields/ant/internal/selectors"
"golang.org/x/net/html"
)
// Page represents a single page: its URL and headers, the unread body
// and, once parse has run, the parsed HTML tree.
//
// Parsing is lazy and happens at most once (see the once field); the
// body is drained and closed as part of that parse.
type Page struct {
// URL is the page's own URL. It is used as the base when resolving
// relative links and is quoted in parse error messages.
URL *url.URL
// Header holds headers associated with the page.
// NOTE(review): presumably the HTTP response headers; confirm against
// the code that constructs Page.
Header http.Header
// body is the raw page content, consumed by parse and closed by close.
body io.ReadCloser
// root is the parsed HTML root node, set by parse.
root *html.Node
// once ensures the body is parsed a single time.
once sync.Once
// err is the outcome of the parse, returned by every call to parse.
err error
}
// parse lazily parses the page body into an HTML root node.
//
// The work runs at most once per page: the first call parses the body,
// stores the resulting node and error, and closes the body. Every later
// call simply returns the stored error.
func (p *Page) parse() error {
	p.once.Do(func() {
		root, err := html.Parse(p.body)
		if err != nil {
			// Add the page URL so the caller can tell which page failed.
			err = fmt.Errorf("ant: parse html %q - %w", p.URL, err)
		}
		p.root, p.err = root, err

		// The body is no longer needed, whether parsing succeeded or not.
		p.close()
	})
	return p.err
}
// Query returns all nodes matching selector.
//
// The zero-value List is returned when the page cannot be parsed or
// when selector fails to compile; neither error is reported to the
// caller.
func (p *Page) Query(selector string) List {
	var none List

	if p.parse() != nil {
		return none
	}

	compiled, err := selectors.Compile(selector)
	if err != nil {
		return none
	}

	return compiled.MatchAll(p.root)
}
// Text returns the text of the nodes selected by selector.
//
// It is shorthand for calling Query and then Text on the resulting
// List.
// NOTE(review): the result for an empty List is presumably the empty
// string; confirm against List.Text.
func (p *Page) Text(selector string) string {
	matches := p.Query(selector)
	return matches.Text()
}
// URLs returns the http and https URLs of every anchor element on the
// page that carries an href attribute.
//
// Hrefs that cannot be parsed as URLs are skipped.
func (p *Page) URLs() URLs {
	const anchors = `a[href]`
	return p.resolve(anchors)
}
// Next returns the resolved URLs of all nodes matching the given
// selector.
//
// The returned error is always nil.
func (p *Page) Next(selector string) (URLs, error) {
	urls := p.resolve(selector)
	return urls, nil
}
// Scan scans data from the page into the given value dst.
//
// The page is parsed first if that has not happened yet. A parse error
// is returned as-is; otherwise the result of the package scanner, run
// on the root node with zero-valued options, is returned.
func (p *Page) Scan(dst interface{}) error {
	err := p.parse()
	if err == nil {
		err = scanner.Scan(dst, p.root, scan.Options{})
	}
	return err
}
// resolve returns the URLs found in the href attribute of every node
// matching selector.
//
// Relative references are resolved against the page URL. Nodes without
// an href, hrefs that fail to parse, and URLs whose scheme is neither
// http nor https are skipped.
func (p *Page) resolve(selector string) URLs {
	nodes := p.Query(selector)
	out := make(URLs, 0, len(nodes))

	for _, node := range nodes {
		href, found := scan.Attr(node, "href")
		if !found {
			continue
		}

		target, err := url.Parse(href)
		if err != nil {
			continue
		}

		// Only references without a scheme need the page URL as a base.
		if !target.IsAbs() {
			target = p.URL.ResolveReference(target)
		}

		if target.Scheme == "https" || target.Scheme == "http" {
			out = append(out, target)
		}
	}

	return out
}
// close discards whatever is left unread in the page's body and then
// closes it.
//
// Only the error from Close is returned; a failure while draining is
// ignored.
// NOTE(review): draining presumably lets an underlying HTTP connection
// be reused; confirm that body is an HTTP response body.
func (p *Page) close() error {
	body := p.body
	_, _ = io.Copy(ioutil.Discard, body)
	return body.Close()
}