-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtalk.go
206 lines (177 loc) · 5 KB
/
talk.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
package speakerdeck
import (
"fmt"
"net/url"
"path"
"sort"
"strings"
"sync"
"github.com/gocolly/colly"
"github.com/luxas/speakerdeck-api/scraper"
log "github.com/sirupsen/logrus"
)
// ScrapeTalks returns either one specific talk if both userHandle and talkID are set, or all of
// the user's talks in detail if only userHandle is set. opts is passed on unchanged to the
// underlying scrape calls.
//
// When only userHandle is set, the user page is scraped first and every talk preview found there
// is then scraped in its own goroutine. A talk that cannot be scraped is logged and left out of
// the result instead of failing the whole call. The result is ordered with sort.Sort.
//
// An error is returned if userHandle is empty, if scraping the single requested talk fails or
// yields something other than a *Talk, or if scraping the user page fails.
func ScrapeTalks(userHandle, talkID string, opts *scraper.ScrapeOptions) (Talks, error) {
	if len(userHandle) == 0 {
		return nil, fmt.Errorf("userHandle is mandatory!")
	}

	// If there was a specific talk given, look it up
	if len(talkID) > 0 {
		talkURL := fmt.Sprintf("%s/%s/%s", speakerdeckRootURL, userHandle, talkID)
		data, err := scraper.Scrape(talkURL, &TalkScraper{}, opts)
		if err != nil {
			return nil, err
		}
		// Check the assertion instead of panicking on an unexpected or nil result.
		talk, ok := data.(*Talk)
		if !ok || talk == nil {
			return nil, fmt.Errorf("unexpected scrape result of type %T for talk %s/%s", data, userHandle, talkID)
		}
		return []Talk{*talk}, nil
	}

	user, err := ScrapeUser(userHandle, opts)
	if err != nil {
		return nil, err
	}

	wg := &sync.WaitGroup{}
	wg.Add(len(user.TalkPreviews))
	mux := &sync.Mutex{}
	talks := make([]Talk, 0, len(user.TalkPreviews))
	for _, t := range user.TalkPreviews {
		go func(talkPreview TalkPreview) {
			defer wg.Done()

			// An empty ID would send the recursive call below down the "all talks" path again,
			// re-scraping the user and spawning goroutines without bound.
			if len(talkPreview.ID) == 0 {
				log.Errorf("could not get speakerdeck talk for %s: talk preview has no ID", user.Author.Handle)
				return
			}

			talkList, err := ScrapeTalks(user.Author.Handle, talkPreview.ID, opts)
			if err != nil {
				log.Errorf("could not get speakerdeck talk %s/%s: %v", user.Author.Handle, talkPreview.ID, err)
				return
			}

			// talks is shared between all goroutines; guard the append.
			mux.Lock()
			talks = append(talks, talkList...)
			mux.Unlock()
		}(t)
	}
	wg.Wait()

	sortedTalks := Talks(talks)
	sort.Sort(sortedTalks)
	return sortedTalks, nil
}
// TalkScraper implements scraper.Scraper for a single talk page.
// It carries no state of its own; the data collected during a scrape lives in the value
// returned by InitialData, which the handlers registered in Hooks fill in.
type TalkScraper struct{}
// Name reports the fixed, human-readable name of this scraper: "TalkScraper".
func (s *TalkScraper) Name() string {
	const scraperName = "TalkScraper"
	return scraperName
}
// Hooks lists the DOM paths the TalkScraper looks for in a scraped talk page, each paired with
// the handler function that extracts data from the matching elements.
func (s *TalkScraper) Hooks() []scraper.Hook {
	// Several hooks select children of the same row in the deck-meta section, so the common
	// selector prefix is spelled out once.
	const metaRow = ".deck-meta .col-md-auto .row > "

	hooks := []scraper.Hook{
		{DOMPath: ".container h1.mb-4", Handler: onTalkTitle},
		{DOMPath: ".col-auto.text-muted", Handler: onTalkDate},
		{DOMPath: ".deck-description.mb-4 p", Handler: onTalkDescription},
		{DOMPath: ".speakerdeck-embed", Handler: onTalkDataID},
		{DOMPath: metaRow + "div:nth-child(1) a", Handler: onTalkCategory},
		{DOMPath: metaRow + "div:nth-child(2) a", Handler: onTalkStars},
		{DOMPath: metaRow + "div:nth-child(3) span[title]", Handler: onTalkViews},
		{DOMPath: metaRow + "div:nth-child(4) a", Handler: onTalkDownloadLink},
		{DOMPath: metaRow + "a:nth-child(1)", Handler: onTalkAuthor},
	}
	return hooks
}
// InitialData creates the value that is handed to every handler registered in Hooks() as its
// second argument. It is whatever NewTalk() returns; the handlers in this file convert it from
// interface{} back to *Talk and fill in its fields.
func (s *TalkScraper) InitialData() interface{} {
	talk := NewTalk()
	return talk
}
// onTalkTitle stores the text of the matched element as the talk's title.
// data must hold a *Talk (the type assertion panics otherwise). Always returns nil, nil.
func onTalkTitle(e *colly.HTMLElement, data interface{}) (*string, error) {
	talk := data.(*Talk)
	title := e.Text
	talk.Title = title
	return nil, nil
}
// onTalkDataID stores the matched element's "data-id" attribute as the talk's DataID.
// data must hold a *Talk (the type assertion panics otherwise). Always returns nil, nil.
func onTalkDataID(e *colly.HTMLElement, data interface{}) (*string, error) {
	talk := data.(*Talk)
	dataID := e.Attr("data-id")
	talk.DataID = dataID
	return nil, nil
}
// onTalkDate parses the matched element's text with parseDate and stores the result as the
// talk's date. If parsing fails, the talk is left untouched and the parse error is returned.
// data must hold a *Talk (the type assertion panics otherwise).
func onTalkDate(e *colly.HTMLElement, data interface{}) (*string, error) {
	talk := data.(*Talk)

	parsed, parseErr := parseDate(e.Text)
	if parseErr != nil {
		return nil, parseErr
	}

	talk.Date = parsed
	return nil, nil
}
// onTalkDescription extracts extra links and the hide marker from a description paragraph.
//
// Every match of linkRegexp in the element's text is parsed as a URL and appended to
// t.ExtraLinks under the URL's host. Links that fail to parse are logged and skipped. If the
// text contains the literal "Hide: true", t.Hide is set to true.
//
// data must hold a *Talk (the type assertion panics otherwise). Always returns nil, nil.
func onTalkDescription(e *colly.HTMLElement, data interface{}) (*string, error) {
	t := data.(*Talk)

	// FindAllString yields every link in the text. FindStringSubmatch, used previously, returns
	// only the first match followed by its capture groups, so later links were ignored and any
	// capture-group fragments were treated as links.
	// NOTE(review): assumes linkRegexp is a *regexp.Regexp — confirm against its declaration.
	for _, link := range linkRegexp.FindAllString(e.Text, -1) {
		parsedLink, err := url.Parse(link)
		if err != nil {
			log.Warnf("Could not parse link %q: %v", link, err)
			continue
		}
		t.ExtraLinks[parsedLink.Host] = append(t.ExtraLinks[parsedLink.Host], parsedLink.String())
	}

	if strings.Contains(e.Text, "Hide: true") {
		t.Hide = true
	}
	return nil, nil
}
// onTalkCategory records the talk's category: the link is the matched element's "href" passed
// through sdPrefix, and the name is the element's text with surrounding whitespace trimmed.
// data must hold a *Talk (the type assertion panics otherwise). Always returns nil, nil.
func onTalkCategory(e *colly.HTMLElement, data interface{}) (*string, error) {
	talk := data.(*Talk)

	href := e.Attr("href")
	talk.CategoryLink = sdPrefix(href)

	name := strings.TrimSpace(e.Text)
	talk.Category = name

	return nil, nil
}
// onTalkStars parses the matched element's text with parseNumber and stores the result as the
// talk's star count. The value returned by parseNumber is stored even when it reports an error;
// that error is passed back to the caller.
// data must hold a *Talk (the type assertion panics otherwise).
func onTalkStars(e *colly.HTMLElement, data interface{}) (*string, error) {
	talk := data.(*Talk)
	stars, err := parseNumber(e.Text)
	talk.Stars = stars
	return nil, err
}
// onTalkViews reads the matched element's "title" attribute, strips a trailing " views" and
// parses the remainder with parseNumber to get the talk's view count. The value returned by
// parseNumber is stored even when it reports an error; that error is passed back to the caller.
// data must hold a *Talk (the type assertion panics otherwise).
func onTalkViews(e *colly.HTMLElement, data interface{}) (*string, error) {
	talk := data.(*Talk)

	title := e.Attr("title")
	views, err := parseNumber(strings.TrimSuffix(title, " views"))
	talk.Views = views

	return nil, err
}
// onTalkDownloadLink stores the matched element's "href" attribute, as-is, as the talk's
// download link.
// data must hold a *Talk (the type assertion panics otherwise). Always returns nil, nil.
func onTalkDownloadLink(e *colly.HTMLElement, data interface{}) (*string, error) {
	talk := data.(*Talk)
	href := e.Attr("href")
	talk.DownloadLink = href
	return nil, nil
}
// onTalkAuthor fills in the talk's own link and ID as well as its author details.
//
// The talk link is the URL of the request that produced the element, and the talk ID is the
// last path element of that link. The author link is the element's "href" passed through
// sdPrefix, the handle is the last path element of the author link, the name is the element's
// trimmed text, and the avatar link is httpsPrefix followed by the "src" of the child "img".
//
// data must hold a *Talk (the type assertion panics otherwise). Always returns nil, nil.
func onTalkAuthor(e *colly.HTMLElement, data interface{}) (*string, error) {
	talk := data.(*Talk)

	// Talk identity comes from the page URL, not from the matched element.
	pageURL := e.Request.URL.String()
	talk.Link = pageURL
	talk.ID = path.Base(pageURL)

	// Author details come from the matched anchor and its avatar image.
	authorLink := sdPrefix(e.Attr("href"))
	talk.Author.Link = authorLink
	talk.Author.Handle = path.Base(authorLink)
	talk.Author.Name = strings.TrimSpace(e.Text)
	talk.Author.AvatarLink = httpsPrefix + e.ChildAttr("img", "src")

	return nil, nil
}