-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathScrape
executable file
·82 lines (69 loc) · 3.1 KB
/
Scrape
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env bash
# Abort on the first failing command, including a failure in any stage
# of a pipeline.
set -eo pipefail

# Physical (symlink-free) absolute path of the directory that holds
# this script.
basedir="$(cd "$(dirname "$0")" && pwd -P)"

# Per-run log file, stamped YYMMDD-HHMMSS. Logging goes to this file by
# default; pass `-o /dev/stderr` to get back wget's original behaviour
# of logging to stderr.
logfile="log/scrape.$(date '+%y%m%d-%H%M%S')"

# Login name and password required for MLadm, stored obfuscated.
# See README for details.
login_name='zram'
login_pw='gxinobky'
####################################################################
# Functions
head_commit_author_timestamp() {
    # Print the author timestamp of the current HEAD commit.
    #
    # The author time is used in preference to the commit time: a
    # commit may be amended merely to rebase it or reword its message,
    # which bumps the commit time although no new files were fetched
    # from the site at that moment.
    #
    # The first `author ` line of the raw commit object is taken and
    # its second-to-last space-separated field is printed (the last
    # field is dropped, then everything up to the final space).
    git cat-file -p HEAD \
        | grep '^author ' \
        | head -n 1 \
        | awk -F '[ ]' '{ print $(NF - 1) }'
}
# Set the timestamps of every regular file found under the given paths.
#
# Arguments:
#   $1    timestamp in seconds since the epoch (handed to `touch -d @...`)
#   $2..  files/directories to search; passed to `find` as given
# Returns:
#   2 if the timestamp is missing or empty; otherwise the exit status of
#   `find`, which is non-zero if the search or any `touch` failed.
set_timestamp() {
    local ts="${1-}"
    if [[ -z $ts ]]; then
        echo 1>&2 "set_timestamp: missing timestamp argument"
        return 2
    fi
    shift
    # `-exec ... {} +` batches file names the way xargs does, but never
    # runs `touch` with an empty file list, so a tree that contains no
    # regular files is not treated as an error.
    find "$@" -type f -exec touch -d "@$ts" -- {} +
}
# Not rot13, but same idea.
# Rotate lower-case ASCII letters forward by six places (a->g, ...,
# t->z, u->a, ..., z->f); every other character passes through as is.
# All arguments are joined with single spaces and the result is printed
# on one line. `printf` is used instead of `echo` so that an argument
# such as `-n` or `-e` is treated as data rather than as an option.
obfuscate() {
    local IFS=' '
    printf '%s\n' "$*" | LC_ALL=C tr 'a-z' 'g-za-f'
}
# Inverse of the six-place rotation applied by obfuscate: g->a, ...,
# z->t, a->u, ..., f->z; every other character passes through as is.
# All arguments are joined with single spaces and the result is printed
# on one line. `printf` is used instead of `echo` so that an argument
# such as `-n` or `-e` is treated as data rather than as an option.
unfuscate() {
    local IFS=' '
    printf '%s\n' "$*" | LC_ALL=C tr 'g-za-f' 'a-z'
}
####################################################################
# Main
# Bail out early, with a recognisable exit code, if wget cannot be run.
if ! wget --version >/dev/null; then
    echo 1>&2 "$(basename "$0"):" "'wget' not installed?"
    # wget itself uses error codes up to 10 or so; 50 is well above them.
    exit 50
fi
# Work from the script's own directory and make sure the directory
# that will hold the log file exists.
cd "$basedir"
logdir=$(dirname "$logfile")
mkdir -p "$logdir"

# ML.0, ML.1, etc. are copies of ML/index.html, perhaps generated due
# to a bug. They are removed on every exit. Because the handler does
# not call `exit` itself, the script still exits with the status of
# the command that ran just before the exit.
remove_ml_copies() {
    rm -f "$basedir"/lists.tlug.jp/ML.[0-9]
}
trap remove_ml_copies EXIT
# wget's -N option checks the file's timestamp against the server's.
# This causes a problem on a fresh checkout because the timestamps
# of all the files will be the current time, even if they are not
# actually the most recent version as of that time.
#
# We work around this by setting the timestamp of all files under
# lists.tlug.jp/ to the author timestamp of the current HEAD commit.
# This is not terribly efficient, but given that wget can take close
# to an hour to check the entire site even if it downloads nothing,
# this adds an insignificant amount of time to the process. (It's
# only a couple of seconds on a fast machine.)
#
# This will break if someone pushes a commit with an incorrect
# future author time.
#
# Read the timestamp into a variable before using it: an unquoted
# substitution that expands to nothing (e.g. git could not read HEAD)
# would vanish from the argument list and shift `lists.tlug.jp/` into
# the timestamp position.
head_ts=$(head_commit_author_timestamp) || head_ts=
if [[ -z $head_ts ]]; then
    echo 1>&2 "$(basename "$0"):" "cannot determine author timestamp of HEAD"
    exit 51 # distinct from the 50 used when wget is missing
fi
set_timestamp "$head_ts" lists.tlug.jp/
# Wget honours robots.txt for its own user agent (or perhaps `*`)
# regardless of what we set with the --user-agent option, so we have
# no way to ask it to use the "User-agent: htdig" section of the
# robots.txt (which allows /MLadm/) rather than the "User-agent: *"
# section (which disallows /MLadm/).
#
# We work around this by ignoring robots.txt and manually excluding
# the /cgi-bin/ and /mailman/ paths that robots.txt also disallows.
#
# Build the fixed part of the command line as an array, so that each
# argument stays a single word and the options can carry real comments.
#
# NOTE(review): --password puts the decoded value on wget's command
# line, where other local users can see it in the process list; confirm
# that this is acceptable for this account.
wget_args=(
    --no-verbose -o "$logfile" --backups=0
    # Ignore robots.txt, but still skip the paths it disallows.
    -e robots=off -X /cgi-bin/,/mailman/
    # --mirror options: recursive, infinite levels, timestamp checks
    -r -l inf -N
    --user "$(unfuscate "$login_name")"
    --password "$(unfuscate "$login_pw")"
)

# Caller-supplied arguments follow ours (e.g. `-o /dev/stderr` to log
# to stderr instead of the log file); the URL to mirror comes last.
# wget is the last command, so its status becomes the script's status.
wget "${wget_args[@]}" "$@" https://lists.tlug.jp/