168 changes: 104 additions & 64 deletions tumbdl.sh
@@ -27,110 +27,150 @@
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# along with this program. If not, see <https://www.gnu.org/licenses/>.

url=$1
targetDir=$2

# global curl options
# to disable progress bar, replace with -s
# to enable verbose mode, add -v
curlOptions='--progress-bar'
# to disable progress bar, replace with '-s'
# to enable verbose mode, add '-v'
curlOptions=( '--progress-bar' )
userAgent='Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0'

# check usage
if [ $# -ne 2 ]; then
echo "Usage: tumbdl [URL] [DIR]"
echo ""
echo "URL: URL of tumblelog, e.g. prostbote.tumblr.com"
echo "DIR: directory to put images in, e.g. prostbote"
exit
HELP='Usage: tumbdl [-c file] [-u string] URL DIR
URL: URL of tumblelog, e.g. prostbote.tumblr.com
DIR: directory to put images in, e.g. prostbote

Safemode options:
-c <cookie file>
-u <user agent>
Safemode works with a login cookie; part of this cookie is a hash that includes your user agent, so you need to supply both. Using Firefox ESR and an add-on, you can get both of these things.
You can get your cookies by installing:
Firefox ESR: https://www.mozilla.org/en-US/firefox/organizations/all/
Add-on: https://addons.mozilla.org/en-US/firefox/addon/export-cookies/
Log into Tumblr, then from the "Tools" menu, select "Export cookies". Use the full path to the cookie file for the -c argument.

You can get your user agent by opening this URL:
https://www.google.com/search?q=what+is+my+user+agent+string'
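# Example safemode invocation (illustrative only: the cookie path is a
# placeholder; the user agent and blog name are the default/example values above):
#   ./tumbdl.sh -c /path/to/cookies.txt \
#     -u 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0' \
#     prostbote.tumblr.com prostbote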
cookieJar=True
while getopts c:u:h opts; do
case "${opts}" in
h)
echo -e "${HELP}"
exit 0
;;
c)
cookieFile="${OPTARG}"
cookieJar=False
if [ ! -e "${cookieFile}" ]; then
echo "Unable to find ${cookieFile}, please check and try again" >&2
exit 1
fi
;;
u)
userAgent="${OPTARG}"
;;
esac
done
shift $(( OPTIND - 1))
url="${1}"
targetDir="${2}"
if [ "${#}" -ne 2 ]; then
echo -e "${HELP}"
exit 1
fi

# sanitize input url
url=$(echo "$url" | sed 's/\/$//g')
url=$( sed 's/\/$//g' <<< "${url}")

# create target dir
mkdir "$targetDir"
touch "$targetDir/articles.txt"
mkdir "${targetDir}"
touch "${targetDir}/articles.txt"

# create cookie jar (not really needed atm)
cookieFile="$(mktemp 2>/dev/null || mktemp -t 'mytmpdir')"

if [ -z "${cookieFile}" ]; then
cookieFile="$(mktemp 2>/dev/null || mktemp -t 'mytmpdir')"
fi
# get first archive page
archiveLink="/archive/"

# loop over archive pages
endOfArchive=0
while [[ $endOfArchive -ne 1 ]]
while [[ ${endOfArchive} -ne 1 ]]
do
# get archive page
archivePage=$(curl $curlOptions -c $cookieFile --referer "http://$url" -A "$userAgent" "$url$archiveLink")
echo "Retrieving archive page $url$archiveLink..."
# If we are passed a cookie file, just use it right off, don't init it with -c
if [ "${cookieJar}" == 'True' ]; then
archivePage=$(curl "${curlOptions[@]}" -c "${cookieFile}" --referer "https://${url}" -A "${userAgent}" "${url}${archiveLink}")
else
archivePage=$(curl "${curlOptions[@]}" -b "${cookieFile}" --referer "https://${url}" -A "${userAgent}" "${url}${archiveLink}")
fi
echo "Retrieving archive page ${url}${archiveLink}..."

# extract links to posts
monthPosts=$(echo "$archivePage" | grep -o -P "/post/[0-9]*.*?\"" | sed 's/"//g')
monthPosts=$( grep -o -P "/post/[0-9]*.*?\"" <<< "${archivePage}" | sed 's/"//g')

# process all posts on this archive page
for postURL in $(echo "$monthPosts")
for postURL in ${monthPosts}
do
# check if post page has already been processed before
if grep -Fxq "$postURL" "$targetDir/articles.txt"
if grep -Fxq "${postURL}" "${targetDir}/articles.txt"
then
echo "Already got $url$postURL, skipping."
echo "Already got ${url}${postURL}, skipping."
else
# get the image links (can be multiple images in sets)
echo "Retrieving post $url$postURL..."
postPage=$(curl $curlOptions -b $cookieFile --referer "http://$url$archiveLink" -A "$userAgent" "$url$postURL")
imageLinks=$(echo "$postPage" | grep -o -P "http[s]*://([0-9]*.)?media\.tumblr\.com/([A-Za-z0-9]*/)?tumblr_[A-Za-z0-9]*_[0-9]*\.[a-z]*" | sort | uniq)
echo "Retrieving post ${url}${postURL}..."
postPage=$(curl "${curlOptions[@]}" -b "${cookieFile}" --referer "https://${url}${archiveLink}" -A "${userAgent}" "${url}${postURL}")
imageLinks=$( grep -o -P "http[s]*://([0-9]*.)?media\.tumblr\.com/([A-Za-z0-9]*/)?tumblr_[A-Za-z0-9]*_[0-9]*\.[a-z]*" <<< "${postPage}" | sort | uniq)
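# (illustrative example of a URL the pattern above matches; the hash and
#  filename are made up: https://64.media.tumblr.com/0a1b2c3d/tumblr_abc123_1280.jpg)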
# remove resolution info from image filename
baseImages=$(echo "$imageLinks" | grep -o "tumblr_.*$" | sed 's/_[0-9]*\.\w*//g' | uniq)
baseImages=$( grep -o "tumblr_.*$" <<< "${imageLinks}" | sed 's/_[0-9]*\.\w*//g' | uniq)
# if we encounter any download errors, don't mark the post as archived
curlError=0

# determine the highest available resolution and download image
if [ ! -z "$baseImages" ]
if [ ! -z "${baseImages}" ]
then

for image in $(echo "$baseImages")
for image in ${baseImages}
do
# get the image name of image with highest resolution
maxResImage=$(echo "$imageLinks" | grep -o "$image.*" | sort -n | head -n 1)
maxResImage=$( grep -o "${image}.*" <<< "${imageLinks}" | sort -n | head -n 1)
# get full image url
maxResImageURL=$(echo "$imageLinks" | grep "$maxResImage")
maxResImageURL=$( grep "${maxResImage}" <<< "${imageLinks}")
# download image (if it doesn't exist)
if [ -e "$targetDir/$maxResImage" ]
if [ -e "${targetDir}/${maxResImage}" ]
then
echo "Image exists, skipping."
else
echo "Downloading image $maxResImageURL..."
curl $curlOptions -b $cookieFile --referer "http://$url$postURL" -A "$userAgent" -o "$targetDir/$maxResImage" "$maxResImageURL"
if [ ! 0 -eq $? ]; then curlError=1; fi;
echo "Downloading image ${maxResImageURL}..."
if ! curl "${curlOptions[@]}" -b "${cookieFile}" --referer "https://${url}${postURL}" -A "${userAgent}" -o "${targetDir}/${maxResImage}" "${maxResImageURL}" ;then
curlError=1
fi
fi
done
else
# no images found, check for video links
echo "No images found, checking for videos"

# check for tumblr hosted videos
videoPlayers=$(echo "$postPage" | grep -o -P "http[s]*://www.tumblr.com/video/.*/[0-9]*/[0-9]*/" | sort | uniq)
for video in $(echo "$videoPlayers")
videoPlayers=$( grep -o -P "http[s]*://www.tumblr.com/video/.*/[0-9]*/[0-9]*/" <<< "${postPage}" | sort | uniq)
for video in ${videoPlayers}
do
echo "Found tumblr-hosted video $video"
echo "Found tumblr-hosted video ${video}"
# get video link and type
videoSource=$(curl $curlOptions -b $cookieFile --referer "http://$url$postURL" -A "$userAgent" "$video" | grep -o -P "<source src=\"http[s]*://[^.]*.tumblr.com/video_file/.*?>")
videoSource=$(curl "${curlOptions[@]}" -b "${cookieFile}" --referer "https://${url}${postURL}" -A "${userAgent}" "${video}" | grep -o -P "<source src=\"http[s]*://[^.]*.tumblr.com/video_file/.*?>")
# get video url
videoURL=$(echo "$videoSource" | grep -o -P "http[s]*://[^.]*.tumblr.com/video_file/[[:0-9A-Za-z]*/]*[0-9]*/tumblr_[A-Za-z0-9]*")
videoURL=$( grep -o -P "http[s]*://[^.]*.tumblr.com/video_file/[[:0-9A-Za-z]*/]*[0-9]*/tumblr_[A-Za-z0-9]*" <<< "${videoSource}" )
# construct filename with extension from type string
videoFile=$(echo "$videoSource" | grep -o -P "tumblr_.*?>" | sed -e 's/<source src=\"//g' -e 's/\" type=\"video\//./g' -e 's/\">//g' -e 's/\//\_/g')
videoFile=$( grep -o -P "tumblr_.*?>" <<< "${videoSource}" | sed -e 's/<source src=\"//g' -e 's/\" type=\"video\//./g' -e 's/\">//g' -e 's/\//\_/g')
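# (illustrative example, assuming a source tag such as
#  <source src="https://www.tumblr.com/video_file/t:Ab12/123456789/tumblr_abc123" type="video/mp4">,
#  the sed chain above would yield videoFile=tumblr_abc123.mp4)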
# download video (if it doesn't exist)
if [ -e "$targetDir/$videoFile" ]
if [ -e "${targetDir}/${videoFile}" ]
then
echo "Video exists, skipping."
else
echo "Downloading video $videoURL"
curl $curlOptions -L -b $cookieFile --referer "http://$url$postURL" -A "$userAgent" -o "$targetDir/$videoFile" "$videoURL"
if [ ! 0 -eq $? ]; then curlError=1; fi;
echo "Downloading video ${videoURL}"
if ! curl "${curlOptions[@]}" -L -b "${cookieFile}" --referer "https://${url}${postURL}" -A "${userAgent}" -o "${targetDir}/${videoFile}" "${videoURL}"; then
curlError=1
fi
fi
done
# check if youtube-dl is available
@@ -139,29 +179,29 @@ do
# gather embedded video urls
otherSource=""
# check for instagram video
otherSource=$(echo "$otherSource"; echo "$postPage" | grep -o -P "http[s]*://www.instagram.com/p/[A-Za-z0-9]*")
otherSource="${otherSource} $( grep -o -P "http[s]*://www.instagram.com/p/[A-Za-z0-9]*" <<< "${postPage}")"
# check for youtube video
otherSource=$(echo "$otherSource"; echo "$postPage" | grep -o -P "http[s]*://www.youtube.com/embed/.*?\?" | sed 's/\?//g')
otherSource="${otherSource} $( grep -o -P "http[s]*://www.youtube.com/embed/.*?\?" <<< "${postPage}" | sed 's/\?//g' )"
# check for vine
otherSource=$(echo "$otherSource"; echo "$postPage" | grep -o -P "http[s]*://vine.co/v/.*?/")
otherSource="${otherSource} $( grep -o -P "http[s]*://vine.co/v/.*?/" <<< "${postPage}" )"
# check for vimeo
otherSource=$(echo "$otherSource"; echo "$postPage" | grep -o -P "http[s]*://player.vimeo.com/video/[0-9]*")
otherSource="${otherSource} $( grep -o -P "http[s]*://player.vimeo.com/video/[0-9]*" <<< "${postPage}" )"
# check for dailymotion
otherSource=$(echo "$otherSource"; echo "$postPage" | grep -o -P "http[s]*://www.dailymotion.com/embed/video/[A-Za-z0-9]*")
otherSource="${otherSource} $( grep -o -P "http[s]*://www.dailymotion.com/embed/video/[A-Za-z0-9]*" <<< "${postPage}" )"
# check for brightcove
otherSource=$(echo "$otherSource"; echo "$postPage" | grep -o -P "http[s]*://players.brightcove.net/.*/index.html\?videoId=[0-9]*")
otherSource="${otherSource} $( grep -o -P "http[s]*://players.brightcove.net/.*/index.html\?videoId=[0-9]*" <<< "${postPage}" )"
# add expressions for other video sites here like this:
#otherSource=$(echo "$otherSource"; echo "$postPage" | grep -o "http[s]*://www.example.com/embed/video/[A-Za-z0-9]*")
#otherSource="${otherSource} $( grep -o "http[s]*://www.example.com/embed/video/[A-Za-z0-9]*" <<< "${postPage}" )"

# if video links were found, try youtube-dl
if [ ! -z $otherSource ]
if [ ! -z "${otherSource}" ]
then
for otherVid in $(echo "$otherSource")
for otherVid in ${otherSource}
do
echo "Found embedded video $otherVid, attempting download via youtube-dl..."
youtube-dl "$otherVid" -o "$targetDir/%(title)s_%(duration)s.%(ext)s" -ciw
# if error occurs, don't mark post as archived
if [ ! 0 -eq $? ]; then curlError=1; fi;
if ! youtube-dl "$otherVid" -o "${targetDir}/%(title)s_%(duration)s.%(ext)s" -ciw ; then
curlError=1
fi
done
else
echo "No videos found, moving on."
@@ -172,23 +212,23 @@ do
fi

# if no error occurred, enter page as downloaded
if [[ $curlError -eq 0 ]]
if [[ ${curlError} -eq 0 ]]
then
echo "$postURL" >> "$targetDir/articles.txt"
echo "${postURL}" >> "${targetDir}/articles.txt"
else
echo "Some error occured during downloading. No articles.txt entry created."
fi

fi
done
# get link to next archive page
archiveLink=$(echo "$archivePage" | grep -o -P "id=\"next_page_link\" href=\".*?\"" | sed -e 's/id=\"next_page_link\" href=\"//g' -e 's/\"//g')
archiveLink=$(echo "${archivePage}" | grep -o -P "id=\"next_page_link\" href=\".*?\"" | sed -e 's/id=\"next_page_link\" href=\"//g' -e 's/\"//g')
# check if we are at the end of the archive (no link is returned)
if [ -z "$archiveLink" ]
if [ -z "${archiveLink}" ]
then
endOfArchive=1
echo "Reached the last archive page. Done!"
else
echo "Next archive page: $url$archiveLink"
echo "Next archive page: ${url}${archiveLink}"
fi
done