-
Notifications
You must be signed in to change notification settings - Fork 0
/
wayback.go
292 lines (239 loc) · 9.19 KB
/
wayback.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
package wayback
import (
"github.com/httpreserve/simplerequest"
"github.com/pkg/errors"
"net/http"
"net/url"
"strings"
"time"
)
// Addresses and path segments used to recognise and build Internet
// Archive (Wayback Machine) links.
const (
	// Plain-HTTP hosts.
	iaRoot = "http://web.archive.org"
	iaBeta = "http://web-beta.archive.org"

	// HTTPS hosts.
	iaSRoot = "https://web.archive.org"
	iaSBeta = "https://web-beta.archive.org"

	// Path segment for save requests,
	// e.g. https://web.archive.org/save/http://www.bbc.com/news
	iaSave = "/save/"

	// Path segment for archived snapshots,
	// e.g. http://web.archive.org/web/20161104020243/http://exponentialdecayxxxx.co.uk/#
	iaWeb = "/web/"
)
// IsWayback reports whether link contains one of the known Internet
// Archive host addresses (plain or beta, http or https). The match is a
// substring match, so the address may appear anywhere inside link.
func IsWayback(link string) bool {
	for _, host := range [...]string{iaRoot, iaBeta, iaSRoot, iaSBeta} {
		if strings.Contains(link, host) {
			return true
		}
	}
	return false
}
// Data stores the information gathered about a link's presence in the
// Internet Archive. It is filled in by GetWaybackData. A sensible order
// for inspecting it is: check AlreadyWayback first (the link was itself
// an archive link, so no lookup was made), then NotInWayback (the
// archive has no copy), and only then the Earliest/Latest fields.
type Data struct {
	AlreadyWayback  error  // Set to ErrorIAExists when the URL is already a Wayback URL; nil otherwise
	NotInWayback    bool   // True when the archive answered 404 with no Location header
	EarliestWayback string // Location header from the earliest-snapshot request
	LatestWayback   string // Location header from the latest-snapshot request
	WaybackSaveURL  string // URL that submits the link to the archive's save endpoint
	ResponseCode    int    // Status code of the earliest-snapshot request
	ResponseText    string // Human readable status text of that same response
}
// Sentinel errors describing a link's relationship to the Internet Archive.
var (
	// ErrorNoIALink lets callers test for the non-existence of a record.
	ErrorNoIALink = errors.New("no internet archive record")

	// ErrorIAExists identifies links that are already archive links and
	// so do not need to be processed, or sent to the archive, a second time.
	ErrorIAExists = errors.New("already an internet archive record")
)
// GetWaybackData looks up link in the Internet Archive and returns what
// was found as a Data value.
//
// If link is already an archive link no request is made: AlreadyWayback
// is set to ErrorIAExists and only WaybackSaveURL is filled in.
// Otherwise two HEAD requests (redirects disabled, 5 second timeout) are
// sent, one for the earliest possible snapshot and one for the latest,
// and the Location header of each response is recorded. A 404 with no
// Location header on the first request sets NotInWayback and returns
// early.
//
// agent is the User-Agent string to send; when empty, Version() is used.
//
// A non-nil error is returned when a lookup URL cannot be built, when
// the request cannot be created, or when either request fails. The Data
// value returned alongside an error holds whatever had been collected up
// to that point.
func GetWaybackData(link string, agent string) (Data, error) {
	var wb Data

	// Links that already point into the archive need no lookup.
	if IsWayback(link) {
		wb.AlreadyWayback = ErrorIAExists
		wb.WaybackSaveURL = SaveURL(link)
		return wb, nil
	}

	earliest, err := GetPotentialURLEarliest(link)
	if err != nil {
		return wb, errors.Wrap(err, "wayback url creation failed")
	}

	// Previously this error was dropped and the request was used regardless.
	sr, err := simplerequest.Create(simplerequest.HEAD, earliest.String())
	if err != nil {
		return wb, errors.Wrap(err, "wayback request creation failed")
	}
	sr.Accept("*/*")

	// Custom user agent, falling back to the package default.
	if agent == "" {
		sr.Agent(Version())
	} else {
		sr.Agent(agent)
	}

	// The redirect target is the answer we want, so do not follow it.
	sr.NoRedirect(true)
	sr.Timeout(5 * time.Second)

	resp, err := sr.Do()
	if err != nil {
		return wb, errors.Wrap(err, "wayback request failed")
	}
	wb.ResponseCode = resp.StatusCode
	wb.ResponseText = resp.StatusText

	// First test for existence of an internet archive copy: a 404 with
	// nowhere to redirect to means there are no snapshots at all.
	if wb.ResponseCode == http.StatusNotFound && resp.Header.Get("Location") == "" {
		wb.NotInWayback = true
		return wb, nil
	}
	wb.EarliestWayback = resp.Header.Get("Location")

	// Reuse the configured request for the latest-snapshot lookup.
	latest, err := GetPotentialURLLatest(link)
	if err != nil {
		return wb, errors.Wrap(err, "wayback url creation failed")
	}
	sr.URL = latest

	resp, err = sr.Do()
	if err != nil {
		return wb, errors.Wrap(err, "wayback request failed")
	}
	wb.LatestWayback = resp.Header.Get("Location")

	wb.WaybackSaveURL = SaveURL(link)
	return wb, nil
}
// Layouts for Go's reference-time based date formatting.
// Explanation: https://andrey.nering.com.br/2015/how-to-format-date-and-time-with-go-lang/
const (
	// datelayout is the 14 digit YYYYMMDDhhmmss form used in archive URLs.
	datelayout = "20060102150405"

	// humandate is the day, full month name and year form shown to users.
	humandate = "02 January 2006"
)
// GetPotentialURLLatest builds an archive URL for archiveurl stamped
// with the current time, which can then be requested to test for a 404
// or a 200 OK. If it works, the URL can be shown to the user for QA. If
// it fails, it can be used to prompt the user to save the link as it is
// found today. Even when the link is already a 404 there is value in
// saving it now: the earlier the date we can pin on a broken link, the
// better we can satisfy ourselves in future that we did all we could.
//
// The URL has the form:
//
//	web.archive.org/web/{date}/url-to-lookup
//	{date} == "20161104020243" == "YYYYMMDDHHMMSS"
//
// The timestamp is taken in UTC, the zone archive timestamps are
// expressed in, so that "now" means the same instant wherever this runs.
// The returned error is whatever constructURL reports.
func GetPotentialURLLatest(archiveurl string) (*url.URL, error) {
	latestDate := time.Now().UTC().Format(datelayout)
	return constructURL(latestDate, archiveurl)
}
// GetPotentialURLEarliest is used to returning the
// earliest possible record available in the internet archive. We
// can make it easier by using this function here.
// Example URI we need to create looks like this:
// web.archive.org/web/{date}/url-to-lookup
func GetPotentialURLEarliest(archiveurl string) (*url.URL, error) {
oldestDate := time.Date(1900, time.August, 31, 23, 13, 0, 0, time.Local).Format(datelayout)
return constructURL(oldestDate, archiveurl)
}
// Snapshot URL prefixes, one per known archive host. The date slug of
// an archive link follows immediately after one of these.
const (
	split1 = iaRoot + iaWeb
	split2 = iaBeta + iaWeb
	split3 = iaSRoot + iaWeb
	split4 = iaSBeta + iaWeb
)

// iasplits lists every snapshot prefix in a form that can be ranged over.
var iasplits = []string{
	split1,
	split2,
	split3,
	split4,
}
// GetHumanDate returns a human readable date (e.g. "04 November 2016")
// taken from the timestamp of an Internet Archive link. It returns the
// empty string when link contains no known snapshot prefix or when the
// text following the prefix is not a 14 digit timestamp.
//
// When link contains several snapshot prefixes (an archived copy of an
// archive link) the first one in the string, i.e. the outermost
// snapshot, supplies the date. A timestamp followed by a flag ending in
// an underscore (e.g. "20161104020243id_") is accepted and the flag is
// ignored.
func GetHumanDate(link string) string {
	// Find the snapshot prefix that occurs earliest in the link.
	first, slugAt := -1, 0
	for _, prefix := range iasplits {
		idx := strings.Index(link, prefix)
		if idx >= 0 && (first < 0 || idx < first) {
			first, slugAt = idx, idx+len(prefix)
		}
	}
	if first < 0 {
		return ""
	}

	// The date slug runs from the end of the prefix up to the next slash.
	dateslug := link[slugAt:]
	if end := strings.Index(dateslug, "/"); end >= 0 {
		dateslug = dateslug[:end]
	}

	// Drop a trailing flag such as "id_" so the timestamp still parses.
	if len(dateslug) > len(datelayout) && strings.HasSuffix(dateslug, "_") {
		dateslug = dateslug[:len(datelayout)]
	}

	date, err := time.Parse(datelayout, dateslug)
	if err != nil {
		return ""
	}
	return date.Format(humandate)
}
// constructURL assembles and parses the snapshot URL
// {iaRoot}/web/{iadate}/{archiveurl} on behalf of the earliest and
// latest lookup helpers. When parsing fails the error is wrapped and
// returned together with whatever url.Parse produced.
func constructURL(iadate string, archiveurl string) (*url.URL, error) {
	target := iaRoot + iaWeb + iadate + "/" + archiveurl
	parsed, err := url.Parse(target)
	if err == nil {
		return parsed, nil
	}
	return parsed, errors.Wrap(err, "wayback url creation failed")
}
// SaveURL returns the address that submits link to the Internet
// Archive's save endpoint: the archive root, the save path, then link
// appended unchanged.
// e.g. https://web.archive.org/save/http://www.bbc.com/news
func SaveURL(link string) string {
	const saveBase = iaRoot + iaSave
	return saveBase + link
}
// Messages carried by the errors SubmitToInternetArchive returns when a
// save request is not answered with success.
const (
	// SaveForbidden indicates that a robots.txt may be blocking the save.
	SaveForbidden = "save forbidden by website"

	// SaveGone indicates that the website may no longer exist at all.
	SaveGone = "bad gateway website maybe no more"

	// SaveUnknown indicates an error in saving we haven't seen yet.
	SaveUnknown = "unknown save error inspect response to improve code"
)
// SubmitToInternetArchive asks the Internet Archive to save link by
// sending a HEAD request (redirects disabled, 5 second timeout) to the
// link's save URL, and returns the archive's response.
//
// agent is the User-Agent string to send; when empty, Version() is used.
//
// A 200 or 302 answer is treated as success. Any other status returns
// the response together with an error whose message is SaveGone (502),
// SaveForbidden (403) or SaveUnknown (anything else). If the request
// cannot be created or sent, an empty response and the error are
// returned.
func SubmitToInternetArchive(link string, agent string) (simplerequest.SimpleResponse, error) {
	// Build the request against the save endpoint for this link.
	sr, err := simplerequest.Create(simplerequest.HEAD, SaveURL(link))
	if err != nil {
		return simplerequest.SimpleResponse{}, err
	}
	sr.Accept("*/*")

	// Custom user agent, falling back to the package default.
	if agent != "" {
		sr.Agent(agent)
	} else {
		sr.Agent(Version())
	}

	sr.NoRedirect(true)
	sr.Timeout(5 * time.Second)

	resp, err := sr.Do()
	if err != nil {
		return simplerequest.SimpleResponse{}, errors.Wrap(err, "wayback save request failed")
	}

	// Translate the status code into the caller-facing outcome.
	switch resp.StatusCode {
	case http.StatusOK, http.StatusFound:
		return resp, nil
	case http.StatusBadGateway:
		return resp, errors.New(SaveGone)
	case http.StatusForbidden:
		return resp, errors.New(SaveForbidden)
	default:
		return resp, errors.New(SaveUnknown)
	}
}
// GetSavedURL builds the URL of a newly archived record from the
// response to a save request. The values of the response's
// Content-Location header are concatenated and appended to the archive
// root, and the result is parsed. On a parse failure an empty URL and a
// wrapped error are returned.
func GetSavedURL(resp http.Response) (*url.URL, error) {
	// The header carries the partial slug of the saved snapshot.
	slug := strings.Join(resp.Header["Content-Location"], "")
	saved, err := url.Parse(iaRoot + slug)
	if err == nil {
		return saved, nil
	}
	return &url.URL{}, errors.Wrap(err, "creation of URL from http response failed.")
}
// getWaybackfromRel splits lnk on "; " and returns the first piece that
// contains the plain-HTTP archive root, or the empty string when no
// piece does. lnk is presumably a header value taken from an earlier
// archive response.
func getWaybackfromRel(lnk string) string {
	pieces := strings.Split(lnk, "; ")
	for i := 0; i < len(pieces); i++ {
		if candidate := pieces[i]; strings.Contains(candidate, iaRoot) {
			return candidate
		}
	}
	return ""
}
// version is the identifier this package sends as its default user agent.
var version = "httpreserve-wayback-0.0.1"

// Version returns the version text for the httpreserve/wayback agent.
func Version() string {
	return version
}