diff --git a/tumbdl.sh b/tumbdl.sh
index c4f5799..478ba72 100755
--- a/tumbdl.sh
+++ b/tumbdl.sh
@@ -27,85 +27,124 @@
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
-url=$1
-targetDir=$2
 # global curl options
-# to disable progress bar, replace with -s
-# to enable verbose mode, add -v
-curlOptions='--progress-bar'
+# to disable progress bar, replace with '-s'
+# to enable verbose mode, add '-v'
+curlOptions=( '--progress-bar' )
 userAgent='Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0'
 # check usage
-if [ $# -ne 2 ]; then
-  echo "Usage: tumbdl [URL] [DIR]"
-  echo ""
-  echo "URL: URL of tumblelog, e.g. prostbote.tumblr.com"
-  echo "DIR: directory to put images in, e.g. prostbote"
-  exit
+HELP='Usage: tumbdl [-h] [-c file] [-u string] URL DIR
+URL: URL of tumblelog, e.g. prostbote.tumblr.com
+DIR: directory to put images in, e.g. prostbote
+
+Safemode options:
+  -c file    path to an exported cookies.txt file
+  -u string  user agent of the browser that exported the cookie file
+  Safemode works with a login cookie; part of this cookie is a hash that includes your user agent. You can get both with Firefox ESR and an add-on.
+  You can get your cookies by installing:
+  Firefox ESR: https://www.mozilla.org/en-US/firefox/organizations/all/
+  Add-on: https://addons.mozilla.org/en-US/firefox/addon/export-cookies/
+  Log into Tumblr, then from the "Tools" menu, select "Export cookies". Use the full path to the cookie file for the -c argument.
+
+  You can get your user agent by opening this URL:
+  https://www.google.com/search?q=what+is+my+user+agent+string'
+cookieJar=True
+while getopts c:u:h opts; do
+  case "${opts}" in
+    h)
+      echo -e "${HELP}"
+      exit 0
+      ;;
+    c)
+      cookieFile="${OPTARG}"
+      cookieJar=False
+      if [ ! -e "${cookieFile}" ]; then
+        echo "Unable to find ${cookieFile}, please check and try again" >&2
+        exit 1
+      fi
+      ;;
+    u)
+      userAgent="${OPTARG}"
+      ;;
+  esac
+done
+shift $(( OPTIND - 1 ))
+url="${1}"
+targetDir="${2}"
+if [ "${#}" -ne 2 ]; then
+  echo -e "${HELP}"
+  exit 1
 fi
-
 # sanitize input url
-url=$(echo "$url" | sed 's/\/$//g')
+url=$( sed 's/\/$//g' <<< "${url}")
 # create target dir
-mkdir "$targetDir"
-touch "$targetDir/articles.txt"
+mkdir "${targetDir}"
+touch "${targetDir}/articles.txt"
 # create cookie jar (not really needed atm)
-cookieFile="$(mktemp 2>/dev/null || mktemp -t 'mytmpdir')"
-
+if [ -z "${cookieFile}" ]; then
+  cookieFile="$(mktemp 2>/dev/null || mktemp -t 'mytmpdir')"
+fi
 # get first archive page
 archiveLink="/archive/"
 # loop over archive pages
 endOfArchive=0
-while [[ $endOfArchive -ne 1 ]]
+while [[ ${endOfArchive} -ne 1 ]]
 do
   # get archive page
-  archivePage=$(curl $curlOptions -c $cookieFile --referer "http://$url" -A "$userAgent" "$url$archiveLink")
-  echo "Retrieving archive page $url$archiveLink..."
+  # If we are passed a cookie file, just use it right off, don't init it with -c
+  if [ "${cookieJar}" == 'True' ]; then
+    archivePage=$(curl "${curlOptions[@]}" -c "${cookieFile}" --referer "https://${url}" -A "${userAgent}" "${url}${archiveLink}")
+  else
+    archivePage=$(curl "${curlOptions[@]}" -b "${cookieFile}" --referer "https://${url}" -A "${userAgent}" "${url}${archiveLink}")
+  fi
+  echo "Retrieving archive page ${url}${archiveLink}..."
   # extract links to posts
-  monthPosts=$(echo "$archivePage" | grep -o -P "/post/[0-9]*.*?\"" | sed 's/"//g')
+  monthPosts=$( grep -o -P "/post/[0-9]*.*?\"" <<< "${archivePage}" | sed 's/"//g')
   # process all posts on this archive page
-  for postURL in $(echo "$monthPosts")
+  for postURL in ${monthPosts}
   do
     # check if post page has already been processed before
-    if grep -Fxq "$postURL" "$targetDir/articles.txt"
+    if grep -Fxq "${postURL}" "${targetDir}/articles.txt"
     then
-      echo "Already got $url$postURL, skipping."
+      echo "Already got ${url}${postURL}, skipping."
     else
       # get the image links (can be multiple images in sets)
-      echo "Retrieving post $url$postURL..."
-      postPage=$(curl $curlOptions -b $cookieFile --referer "http://$url$archiveLink" -A "$userAgent" "$url$postURL")
-      imageLinks=$(echo "$postPage" | grep -o -P "http[s]*://([0-9]*.)?media\.tumblr\.com/([A-Za-z0-9]*/)?tumblr_[A-Za-z0-9]*_[0-9]*\.[a-z]*" | sort | uniq)
+      echo "Retrieving post ${url}${postURL}..."
+      postPage=$(curl "${curlOptions[@]}" -b "${cookieFile}" --referer "https://${url}${archiveLink}" -A "${userAgent}" "${url}${postURL}")
+      imageLinks=$( grep -o -P "http[s]*://([0-9]*.)?media\.tumblr\.com/([A-Za-z0-9]*/)?tumblr_[A-Za-z0-9]*_[0-9]*\.[a-z]*" <<< "${postPage}" | sort | uniq)
       # remove resolution info from image filename
-      baseImages=$(echo "$imageLinks" | grep -o "tumblr_.*$" | sed 's/_[0-9]*\.\w*//g' | uniq)
+      baseImages=$( grep -o "tumblr_.*$" <<< "${imageLinks}" | sed 's/_[0-9]*\.\w*//g' | uniq)
       # if we encounter any download errors, don't mark the post as archived
       curlError=0
       # determine the highest available resolution and download image
-      if [ ! -z "$baseImages" ]
+      if [ ! -z "${baseImages}" ]
       then
-        for image in $(echo "$baseImages")
+        for image in ${baseImages}
         do
           # get the image name of image with highest resolution
-          maxResImage=$(echo "$imageLinks" | grep -o "$image.*" | sort -n | head -n 1)
+          maxResImage=$( grep -o "${image}.*" <<< "${imageLinks}" | sort -n | head -n 1)
           # get full image url
-          maxResImageURL=$(echo "$imageLinks" | grep "$maxResImage")
+          maxResImageURL=$( grep "${maxResImage}" <<< "${imageLinks}")
           # download image (if it doesn't exist)
-          if [ -e "$targetDir/$maxResImage" ]
+          if [ -e "${targetDir}/${maxResImage}" ]
          then
            echo "Image exists, skipping."
          else
-            echo "Downloading image $maxResImageURL..."
-            curl $curlOptions -b $cookieFile --referer "http://$url$postURL" -A "$userAgent" -o "$targetDir/$maxResImage" "$maxResImageURL"
-            if [ ! 0 -eq $? ]; then curlError=1; fi;
+            echo "Downloading image ${maxResImageURL}..."
+            if ! curl "${curlOptions[@]}" -b "${cookieFile}" --referer "https://${url}${postURL}" -A "${userAgent}" -o "${targetDir}/${maxResImage}" "${maxResImageURL}"; then
curl "${curlOptions[@]}" -b "${cookieFile}" --referer "https://${url}${postURL}" -A "${userAgent}" -o "${targetDir}/${maxResImage}" "${maxResImageURL}" ;then + curlError=1 + fi fi done else @@ -113,24 +152,25 @@ do echo "No images found, checking for videos" # check for tumblr hosted videos - videoPlayers=$(echo "$postPage" | grep -o -P "http[s]*://www.tumblr.com/video/.*/[0-9]*/[0-9]*/" | sort | uniq) - for video in $(echo "$videoPlayers") + videoPlayers=$( grep -o -P "http[s]*://www.tumblr.com/video/.*/[0-9]*/[0-9]*/" <<< "${postPage}" | sort | uniq) + for video in ${videoPlayers} do - echo "Found tumblr-hosted video $video" + echo "Found tumblr-hosted video ${video}" # get video link and type - videoSource=$(curl $curlOptions -b $cookieFile --referer "http://$url$postURL" -A "$userAgent" "$video" | grep -o -P "") + videoSource=$(curl "${curlOptions[@]}" -b "${cookieFile}" --referer "https://${url}${postURL}" -A "${userAgent}" "${video}" | grep -o -P "") # get video url - videoURL=$(echo "$videoSource" | grep -o -P "http[s]*://[^.]*.tumblr.com/video_file/[[:0-9A-Za-z]*/]*[0-9]*/tumblr_[A-Za-z0-9]*") + videoURL=$( grep -o -P "http[s]*://[^.]*.tumblr.com/video_file/[[:0-9A-Za-z]*/]*[0-9]*/tumblr_[A-Za-z0-9]*" <<< "${videoSource}" ) # construct filename with extension from type string - videoFile=$(echo "$videoSource" | grep -o -P "tumblr_.*?>" | sed -e 's///g' -e 's/\//\_/g') + videoFile=$( grep -o -P "tumblr_.*?>" <<< "${videoSource}" | sed -e 's///g' -e 's/\//\_/g') # download video (if it doesn't exist) - if [ -e "$targetDir/$videoFile" ] + if [ -e "${targetDir}/${videoFile}" ] then echo "Video exists, skipping." else - echo "Downloading video $videoURL" - curl $curlOptions -L -b $cookieFile --referer "http://$url$postURL" -A "$userAgent" -o "$targetDir/$videoFile" "$videoURL" - if [ ! 0 -eq $? ]; then curlError=1; fi; + echo "Downloading video ${videoURL}" + if ! curl "${curlOptions[@]}" -L -b "${cookieFile}" --referer "https://${url}${postURL}" -A "${userAgent}" -o "${targetDir}/${videoFile}" "${videoURL}"; then + curlError=1 + fi fi done # check if youtube-dl is available @@ -139,29 +179,29 @@ do # gather embedded video urls otherSource="" # check for instagram video - otherSource=$(echo "$otherSource"; echo "$postPage" | grep -o -P "http[s]*://www.instagram.com/p/[A-Za-z0-9]*") + otherSource="${otherSource} $( grep -o -P "http[s]*://www.instagram.com/p/[A-Za-z0-9]*" <<< "${postPage}")" # check fou youtube video - otherSource=$(echo "$otherSource"; echo "$postPage" | grep -o -P "http[s]*://www.youtube.com/embed/.*?\?" | sed 's/\?//g') + otherSource="${otherSource} $( grep -o -P "http[s]*://www.youtube.com/embed/.*?\?" 
          # check for vine
-          otherSource=$(echo "$otherSource"; echo "$postPage" | grep -o -P "http[s]*://vine.co/v/.*?/")
+          otherSource="${otherSource} $( grep -o -P "http[s]*://vine.co/v/.*?/" <<< "${postPage}" )"
          # check for vimeo
-          otherSource=$(echo "$otherSource"; echo "$postPage" | grep -o -P "http[s]*://player.vimeo.com/video/[0-9]*")
+          otherSource="${otherSource} $( grep -o -P "http[s]*://player.vimeo.com/video/[0-9]*" <<< "${postPage}" )"
          # check for dailymotion
-          otherSource=$(echo "$otherSource"; echo "$postPage" | grep -o -P "http[s]*://www.dailymotion.com/embed/video/[A-Za-z0-9]*")
+          otherSource="${otherSource} $( grep -o -P "http[s]*://www.dailymotion.com/embed/video/[A-Za-z0-9]*" <<< "${postPage}" )"
          # check for brightcove
-          otherSource=$(echo "$otherSource"; echo "$postPage" | grep -o -P "http[s]*://players.brightcove.net/.*/index.html\?videoId=[0-9]*")
+          otherSource="${otherSource} $( grep -o -P "http[s]*://players.brightcove.net/.*/index.html\?videoId=[0-9]*" <<< "${postPage}" )"
          # add expressions for other video sites here like this:
-          #otherSource=$(echo "$otherSource"; echo "$postPage" | grep -o "http[s]*://www.example.com/embed/video/[A-Za-z0-9]*")
+          #otherSource="${otherSource} $( grep -o "http[s]*://www.example.com/embed/video/[A-Za-z0-9]*" <<< "${postPage}" )"
          # if video links were found, try youtube-dl
-          if [ ! -z $otherSource ]
+          if [ ! -z "${otherSource}" ]
          then
-            for otherVid in $(echo "$otherSource")
+            for otherVid in ${otherSource}
            do
              echo "Found embedded video $otherVid, attempting download via youtube-dl..."
-              youtube-dl "$otherVid" -o "$targetDir/%(title)s_%(duration)s.%(ext)s" -ciw
-              # if error occurs, don't mark post as archived
-              if [ ! 0 -eq $? ]; then curlError=1; fi;
+              if ! youtube-dl "$otherVid" -o "${targetDir}/%(title)s_%(duration)s.%(ext)s" -ciw; then
+                curlError=1
+              fi
            done
          else
            echo "No videos found, moving on."
@@ -172,9 +212,9 @@ do
      fi
      # if no error occured, enter page as downloaded
-      if [[ $curlError -eq 0 ]]
+      if [[ ${curlError} -eq 0 ]]
      then
-        echo "$postURL" >> "$targetDir/articles.txt"
+        echo "${postURL}" >> "${targetDir}/articles.txt"
      else
        echo "Some error occured during downloading. No articles.txt entry created."
      fi
@@ -182,13 +222,13 @@ do
    fi
  done
  # get link to next archive page
-  archiveLink=$(echo "$archivePage" | grep -o -P "id=\"next_page_link\" href=\".*?\"" | sed -e 's/id=\"next_page_link\" href=\"//g' -e 's/\"//g')
+  archiveLink=$(echo "${archivePage}" | grep -o -P "id=\"next_page_link\" href=\".*?\"" | sed -e 's/id=\"next_page_link\" href=\"//g' -e 's/\"//g')
  # check if we are at the end of the archive (no link is returned)
-  if [ -z "$archiveLink" ]
+  if [ -z "${archiveLink}" ]
  then
    endOfArchive=1
    echo "Reached the last archive page. Done!"
  else
-    echo "Next archive page: $url$archiveLink"
+    echo "Next archive page: ${url}${archiveLink}"
  fi
done
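
Usage sketch for the patched script. The cookie path below is a placeholder, not a value from the patch; the user agent passed with -u must belong to the browser session that exported the cookies (the quoted string here is just the script's default Firefox ESR user agent).

# plain run, same as before the patch
./tumbdl.sh prostbote.tumblr.com prostbote

# safemode run, reusing a logged-in session exported from Firefox ESR (hypothetical cookie path)
./tumbdl.sh -c /path/to/cookies.txt -u 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0' prostbote.tumblr.com prostbote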