diff --git a/operators/bilibili.py b/operators/bilibili.py index 2a7cd96..68be7f7 100644 --- a/operators/bilibili.py +++ b/operators/bilibili.py @@ -15,8 +15,10 @@ def url_patterns(cls) -> list: ] async def main(self, url: str): - instance = bilibili.Bilibili() - info = await instance.extract_info_only(url, info_only=True) + instance = bilibili.Bilibili(url) + instance.prepare() + instance.extract() + info = instance.streams_sorted if not info and instance.dash_streams: info = [ { @@ -36,15 +38,16 @@ async def main(self, url: str): extra_info = {} - if hasattr(instance, 'video_cid'): - extra_info['cid'] = instance.video_cid + if hasattr(instance, 'vid'): + extra_info['cid'] = instance.vid if hasattr(instance, 'lyrics'): extra_info['lyrics'] = instance.lyrics if hasattr(instance, 'danmaku'): extra_info['danmaku'] = instance.danmaku - if hasattr(instance, 'duration_ms'): - extra_info['duration_ms'] = instance.duration_ms - else: - extra_info['duration_ms'] = 0 + # FIXME: no attr + # if hasattr(instance, 'duration_ms'): + # extra_info['duration_ms'] = instance.duration_ms + # else: + extra_info['duration_ms'] = 0 return {'streams': info, 'extractor': 'BiliBili', 'extra': extra_info} diff --git a/you_get/__init__.py b/you_get/__init__.py old mode 100755 new mode 100644 diff --git a/you_get/__main__.py b/you_get/__main__.py old mode 100755 new mode 100644 diff --git a/you_get/cli_wrapper/__init__.py b/you_get/cli_wrapper/__init__.py old mode 100755 new mode 100644 diff --git a/you_get/cli_wrapper/downloader/__init__.py b/you_get/cli_wrapper/downloader/__init__.py old mode 100755 new mode 100644 diff --git a/you_get/cli_wrapper/openssl/__init__.py b/you_get/cli_wrapper/openssl/__init__.py old mode 100755 new mode 100644 diff --git a/you_get/cli_wrapper/player/__init__.py b/you_get/cli_wrapper/player/__init__.py old mode 100755 new mode 100644 diff --git a/you_get/cli_wrapper/player/__main__.py b/you_get/cli_wrapper/player/__main__.py old mode 100755 new mode 100644 diff --git a/you_get/cli_wrapper/player/dragonplayer.py b/you_get/cli_wrapper/player/dragonplayer.py old mode 100755 new mode 100644 diff --git a/you_get/cli_wrapper/player/gnome_mplayer.py b/you_get/cli_wrapper/player/gnome_mplayer.py old mode 100755 new mode 100644 diff --git a/you_get/cli_wrapper/player/mplayer.py b/you_get/cli_wrapper/player/mplayer.py old mode 100755 new mode 100644 diff --git a/you_get/cli_wrapper/player/vlc.py b/you_get/cli_wrapper/player/vlc.py old mode 100755 new mode 100644 diff --git a/you_get/cli_wrapper/player/wmp.py b/you_get/cli_wrapper/player/wmp.py old mode 100755 new mode 100644 diff --git a/you_get/cli_wrapper/transcoder/__init__.py b/you_get/cli_wrapper/transcoder/__init__.py old mode 100755 new mode 100644 diff --git a/you_get/cli_wrapper/transcoder/ffmpeg.py b/you_get/cli_wrapper/transcoder/ffmpeg.py old mode 100755 new mode 100644 diff --git a/you_get/cli_wrapper/transcoder/libav.py b/you_get/cli_wrapper/transcoder/libav.py old mode 100755 new mode 100644 diff --git a/you_get/cli_wrapper/transcoder/mencoder.py b/you_get/cli_wrapper/transcoder/mencoder.py old mode 100755 new mode 100644 diff --git a/you_get/common.py b/you_get/common.py old mode 100755 new mode 100644 index 13180b7..8c565d7 --- a/you_get/common.py +++ b/you_get/common.py @@ -14,7 +14,6 @@ from http import cookiejar from importlib import import_module from urllib import request, parse, error -from aiohttp import ClientSession from .version import __version__ from .util import log, term @@ -137,6 +136,8 @@ output_filename = None auto_rename = False insecure = False +m3u8 = False +postfix = False fake_headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # noqa @@ -341,10 +342,34 @@ def undeflate(data): return decompressobj.decompress(data)+decompressobj.flush() +# an http.client implementation of get_content() +# because urllib does not support "Connection: keep-alive" +def getHttps(host, url, headers, gzip=True, deflate=False, debuglevel=0): + import http.client + + conn = http.client.HTTPSConnection(host) + conn.set_debuglevel(debuglevel) + conn.request("GET", url, headers=headers) + resp = conn.getresponse() + + data = resp.read() + if gzip: + data = ungzip(data) + if deflate: + data = undeflate(data) + + return str(data, encoding='utf-8') + + # DEPRECATED in favor of get_content() def get_response(url, faker=False): logging.debug('get_response: %s' % url) - + ctx = None + if insecure: + # ignore ssl errors + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE # install cookies if cookies: opener = request.build_opener(request.HTTPCookieProcessor(cookies)) @@ -352,10 +377,10 @@ def get_response(url, faker=False): if faker: response = request.urlopen( - request.Request(url, headers=fake_headers), None + request.Request(url, headers=fake_headers), None, context=ctx, ) else: - response = request.urlopen(url) + response = request.urlopen(url, context=ctx) data = response.read() if response.info().get('Content-Encoding') == 'gzip': @@ -418,7 +443,7 @@ def urlopen_with_retry(*args, **kwargs): raise http_error -async def get_content(url, headers={}, decoded=True): +def get_content(url, headers={}, decoded=True): """Gets the content of a URL via sending a HTTP GET request. Args: @@ -432,35 +457,41 @@ async def get_content(url, headers={}, decoded=True): logging.debug('get_content: %s' % url) - async with ClientSession() as session: - async with session.get(url, headers = headers, cookies = cookies) as resp: - # req = request.Request(url, headers=headers) - # if cookies: - # cookies.add_cookie_header(req) - # req.headers.update(req.unredirected_hdrs) - - # response = urlopen_with_retry(req) - # data = response.read() - data = await resp.read() - - # Handle HTTP compression for gzip and deflate (zlib) - # content_encoding = resp.headers.get('Content-Encoding', '') - # if content_encoding == 'gzip': - # data = ungzip(data) - # elif content_encoding == 'deflate': - # data = undeflate(data) - - # Decode the response body - if decoded: - charset = match1( - resp.headers.get('Content-Type', ''), r'charset=([\w-]+)' - ) - if charset is not None: - data = data.decode(charset, 'ignore') - else: - data = data.decode('utf-8', 'ignore') + req = request.Request(url, headers=headers) + if cookies: + # NOTE: Do not use cookies.add_cookie_header(req) + # #HttpOnly_ cookies were not supported by CookieJar and MozillaCookieJar properly until python 3.10 + # See also: + # - https://github.com/python/cpython/pull/17471 + # - https://bugs.python.org/issue2190 + # Here we add cookies to the request headers manually + cookie_strings = [] + for cookie in list(cookies): + cookie_strings.append(cookie.name + '=' + cookie.value) + cookie_headers = {'Cookie': '; '.join(cookie_strings)} + req.headers.update(cookie_headers) + + response = urlopen_with_retry(req) + data = response.read() - return data + # Handle HTTP compression for gzip and deflate (zlib) + content_encoding = response.getheader('Content-Encoding') + if content_encoding == 'gzip': + data = ungzip(data) + elif content_encoding == 'deflate': + data = undeflate(data) + + # Decode the response body + if decoded: + charset = match1( + response.getheader('Content-Type', ''), r'charset=([\w-]+)' + ) + if charset is not None: + data = data.decode(charset, 'ignore') + else: + data = data.decode('utf-8', 'ignore') + + return data def post_content(url, headers={}, post_data={}, decoded=True, **kwargs): @@ -481,8 +512,17 @@ def post_content(url, headers={}, post_data={}, decoded=True, **kwargs): req = request.Request(url, headers=headers) if cookies: - cookies.add_cookie_header(req) - req.headers.update(req.unredirected_hdrs) + # NOTE: Do not use cookies.add_cookie_header(req) + # #HttpOnly_ cookies were not supported by CookieJar and MozillaCookieJar properly until python 3.10 + # See also: + # - https://github.com/python/cpython/pull/17471 + # - https://bugs.python.org/issue2190 + # Here we add cookies to the request headers manually + cookie_strings = [] + for cookie in list(cookies): + cookie_strings.append(cookie.name + '=' + cookie.value) + cookie_headers = {'Cookie': '; '.join(cookie_strings)} + req.headers.update(cookie_headers) if kwargs.get('post_data_raw'): post_data_enc = bytes(kwargs['post_data_raw'], 'utf-8') else: @@ -510,26 +550,18 @@ def post_content(url, headers={}, post_data={}, decoded=True, **kwargs): return data -async def url_size(url, faker=False, headers={}): - # if faker: - # response = urlopen_with_retry( - # request.Request(url, headers=fake_headers) - # ) - # elif headers: - # response = urlopen_with_retry(request.Request(url, headers=headers)) - # else: - # response = urlopen_with_retry(url) - - req_headers = {} - if faker : - req_headers = fake_headers - elif headers : - req_headers = headers - - async with ClientSession() as session: - async with session.get(url, headers = req_headers) as resp: - size = resp.headers['content-length'] - return int(size) if size is not None else float('inf') +def url_size(url, faker=False, headers={}): + if faker: + response = urlopen_with_retry( + request.Request(url, headers=fake_headers) + ) + elif headers: + response = urlopen_with_retry(request.Request(url, headers=headers)) + else: + response = urlopen_with_retry(url) + + size = response.headers['content-length'] + return int(size) if size is not None else float('inf') def urls_size(urls, faker=False, headers={}): @@ -548,74 +580,66 @@ def get_head(url, headers=None, get_method='HEAD'): return res.headers -async def url_info(url, faker=False, headers={}): +def url_info(url, faker=False, headers={}): logging.debug('url_info: %s' % url) - # if faker: - # response = urlopen_with_retry( - # request.Request(url, headers=fake_headers) - # ) - # elif headers: - # response = urlopen_with_retry(request.Request(url, headers=headers)) - # else: - # response = urlopen_with_retry(request.Request(url)) - - req_headers = {} - if faker : - req_headers = fake_headers - elif headers : - req_headers = headers - - async with ClientSession() as session: - async with session.get(url, headers = req_headers) as resp: - headers = resp.headers - - type = headers['content-type'] - if type == 'image/jpg; charset=UTF-8' or type == 'image/jpg': - type = 'audio/mpeg' # fix for netease - mapping = { - 'video/3gpp': '3gp', - 'video/f4v': 'flv', - 'video/mp4': 'mp4', - 'video/MP2T': 'ts', - 'video/quicktime': 'mov', - 'video/webm': 'webm', - 'video/x-flv': 'flv', - 'video/x-ms-asf': 'asf', - 'audio/mp4': 'mp4', - 'audio/mpeg': 'mp3', - 'audio/wav': 'wav', - 'audio/x-wav': 'wav', - 'audio/wave': 'wav', - 'image/jpeg': 'jpg', - 'image/png': 'png', - 'image/gif': 'gif', - 'application/pdf': 'pdf', - } - if type in mapping: - ext = mapping[type] - else: - type = None - if headers['content-disposition']: - try: - filename = parse.unquote( - r1(r'filename="?([^"]+)"?', headers['content-disposition']) - ) - if len(filename.split('.')) > 1: - ext = filename.split('.')[-1] - else: - ext = None - except: - ext = None + if faker: + response = urlopen_with_retry( + request.Request(url, headers=fake_headers) + ) + elif headers: + response = urlopen_with_retry(request.Request(url, headers=headers)) + else: + response = urlopen_with_retry(request.Request(url)) + + headers = response.headers + + type = headers['content-type'] + if type == 'image/jpg; charset=UTF-8' or type == 'image/jpg': + type = 'audio/mpeg' # fix for netease + mapping = { + 'video/3gpp': '3gp', + 'video/f4v': 'flv', + 'video/mp4': 'mp4', + 'video/MP2T': 'ts', + 'video/quicktime': 'mov', + 'video/webm': 'webm', + 'video/x-flv': 'flv', + 'video/x-ms-asf': 'asf', + 'audio/mp4': 'mp4', + 'audio/mpeg': 'mp3', + 'audio/wav': 'wav', + 'audio/x-wav': 'wav', + 'audio/wave': 'wav', + 'image/jpeg': 'jpg', + 'image/png': 'png', + 'image/gif': 'gif', + 'application/pdf': 'pdf', + } + if type in mapping: + ext = mapping[type] + else: + type = None + if headers['content-disposition']: + try: + filename = parse.unquote( + r1(r'filename="?([^"]+)"?', headers['content-disposition']) + ) + if len(filename.split('.')) > 1: + ext = filename.split('.')[-1] else: ext = None + except: + ext = None + else: + ext = None - if headers['transfer-encoding'] != 'chunked': - size = headers['content-length'] and int(headers['content-length']) - else: - size = None + if headers['transfer-encoding'] != 'chunked': + size = headers['content-length'] and int(headers['content-length']) + else: + size = None - return type, ext, size + return type, ext, size def url_locations(urls, faker=False, headers={}): @@ -985,9 +1009,22 @@ def download_urls( pass title = tr(get_filename(title)) + if postfix and 'vid' in kwargs: + title = "%s [%s]" % (title, kwargs['vid']) output_filename = get_output_filename(urls, title, ext, output_dir, merge) output_filepath = os.path.join(output_dir, output_filename) - + from fastapi.logger import logger + log.w( urls) + log.w( title) + log.w( ext) + log.w( total_size) + log.w( output_dir) + log.w( refer) + log.w( merge) + log.w( faker) + log.w( headers) + log.w( json.dumps(kwargs)) + print() if total_size: if not force and os.path.exists(output_filepath) and not auto_rename\ and (os.path.getsize(output_filepath) >= total_size * 0.9\ @@ -1341,7 +1378,13 @@ def download_main(download, download_playlist, urls, playlist, **kwargs): if re.match(r'https?://', url) is None: url = 'http://' + url - if playlist: + if m3u8: + if output_filename: + title = output_filename + else: + title = "m3u8file" + download_url_ffmpeg(url=url, title=title,ext = 'mp4',output_dir = '.') + elif playlist: download_playlist(url, **kwargs) else: download(url, **kwargs) @@ -1445,7 +1488,6 @@ def set_socks_proxy(proxy): proxy_info = proxy.split("@") socks_proxy_addrs = proxy_info[1].split(':') socks_proxy_auth = proxy_info[0].split(":") - print(socks_proxy_auth[0]+" "+socks_proxy_auth[1]+" "+socks_proxy_addrs[0]+" "+socks_proxy_addrs[1]) socks.set_default_proxy( socks.SOCKS5, socks_proxy_addrs[0], @@ -1456,7 +1498,6 @@ def set_socks_proxy(proxy): ) else: socks_proxy_addrs = proxy.split(':') - print(socks_proxy_addrs[0]+" "+socks_proxy_addrs[1]) socks.set_default_proxy( socks.SOCKS5, socks_proxy_addrs[0], @@ -1529,6 +1570,10 @@ def print_version(): '--no-caption', action='store_true', help='Do not download captions (subtitles, lyrics, danmaku, ...)' ) + download_grp.add_argument( + '--postfix', action='store_true', default=False, + help='Postfix downloaded files with unique identifiers' + ) download_grp.add_argument( '-f', '--force', action='store_true', default=False, help='Force overwriting existing files' @@ -1621,6 +1666,10 @@ def print_version(): download_grp.add_argument('--stream', help=argparse.SUPPRESS) download_grp.add_argument('--itag', help=argparse.SUPPRESS) + download_grp.add_argument('-m', '--m3u8', action='store_true', default=False, + help = 'download video using an m3u8 url') + + parser.add_argument('URL', nargs='*', help=argparse.SUPPRESS) args = parser.parse_args() @@ -1646,6 +1695,8 @@ def print_version(): global output_filename global auto_rename global insecure + global m3u8 + global postfix output_filename = args.output_filename extractor_proxy = args.extractor_proxy @@ -1667,6 +1718,9 @@ def print_version(): if args.cookies: load_cookies(args.cookies) + if args.m3u8: + m3u8 = True + caption = True stream_id = args.format or args.stream or args.itag if args.no_caption: @@ -1679,6 +1733,7 @@ def print_version(): # ignore ssl insecure = True + postfix = args.postfix if args.no_proxy: set_http_proxy('') @@ -1765,20 +1820,10 @@ def google_search(url): url = 'https://www.google.com/search?tbm=vid&q=%s' % parse.quote(keywords) page = get_content(url, headers=fake_headers) videos = re.findall( - r'

([^<]+)<', page + r'(https://www\.youtube\.com/watch\?v=[\w-]+)', page ) - vdurs = re.findall(r'([^<]+)<', page) - durs = [r1(r'(\d+:\d+)', unescape_html(dur)) for dur in vdurs] - print('Google Videos search:') - for v in zip(videos, durs): - print('- video: {} [{}]'.format( - unescape_html(v[0][1]), - v[1] if v[1] else '?' - )) - print('# you-get %s' % log.sprint(v[0][0], log.UNDERLINE)) - print() print('Best matched result:') - return(videos[0][0]) + return(videos[0]) def url_to_module(url): diff --git a/you_get/extractor.py b/you_get/extractor.py old mode 100755 new mode 100644 index ab9dbeb..a70db14 --- a/you_get/extractor.py +++ b/you_get/extractor.py @@ -59,28 +59,6 @@ def download_by_url(self, url, **kwargs): self.extract(**kwargs) self.download(**kwargs) - - async def extract_info_only(self, url, **kwargs): - self.url = url - self.vid = None - - # if 'extractor_proxy' in kwargs and kwargs['extractor_proxy']: - # set_proxy(parse_host(kwargs['extractor_proxy'])) - await self.prepare(**kwargs) - if self.out: - return - # if 'extractor_proxy' in kwargs and kwargs['extractor_proxy']: - # unset_proxy() - - try: - self.streams_sorted = [dict([('id', stream_type['id'])] + list(self.streams[stream_type['id']].items())) for stream_type in self.__class__.stream_types if stream_type['id'] in self.streams] - except: - self.streams_sorted = [dict([('itag', stream_type['itag'])] + list(self.streams[stream_type['itag']].items())) for stream_type in self.__class__.stream_types if stream_type['itag'] in self.streams] - - # self.extract(**kwargs) - - # self.download(**kwargs) - return self.streams_sorted def download_by_vid(self, vid, **kwargs): self.url = None @@ -260,8 +238,8 @@ def download(self, **kwargs): download_urls(urls, self.title, ext, total_size, headers=headers, output_dir=kwargs['output_dir'], merge=kwargs['merge'], - av=stream_id in self.dash_streams) - + av=stream_id in self.dash_streams, + vid=self.vid) if 'caption' not in kwargs or not kwargs['caption']: print('Skipping captions or danmaku.') return diff --git a/you_get/extractors/__init__.py b/you_get/extractors/__init__.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/acfun.py b/you_get/extractors/acfun.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/alive.py b/you_get/extractors/alive.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/archive.py b/you_get/extractors/archive.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/baidu.py b/you_get/extractors/baidu.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/bandcamp.py b/you_get/extractors/bandcamp.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/baomihua.py b/you_get/extractors/baomihua.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/bigthink.py b/you_get/extractors/bigthink.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/bilibili.py b/you_get/extractors/bilibili.py old mode 100755 new mode 100644 index ab24a82..1a13b61 --- a/you_get/extractors/bilibili.py +++ b/you_get/extractors/bilibili.py @@ -12,8 +12,12 @@ class Bilibili(VideoExtractor): # Bilibili media encoding options, in descending quality order. stream_types = [ - {'id': 'hdflv2', 'quality': 125, 'audio_quality': 30280, - 'container': 'FLV', 'video_resolution': '3840p', 'desc': '真彩 HDR'}, + {'id': 'hdflv2_8k', 'quality': 127, 'audio_quality': 30280, + 'container': 'FLV', 'video_resolution': '4320p', 'desc': '超高清 8K'}, + {'id': 'hdflv2_dolby', 'quality': 126, 'audio_quality': 30280, + 'container': 'FLV', 'video_resolution': '3840p', 'desc': '杜比视界'}, + {'id': 'hdflv2_hdr', 'quality': 125, 'audio_quality': 30280, + 'container': 'FLV', 'video_resolution': '2160p', 'desc': '真彩 HDR'}, {'id': 'hdflv2_4k', 'quality': 120, 'audio_quality': 30280, 'container': 'FLV', 'video_resolution': '2160p', 'desc': '超清 4K'}, {'id': 'flv_p60', 'quality': 116, 'audio_quality': 30280, @@ -68,10 +72,6 @@ def bilibili_headers(referer=None, cookie=None): def bilibili_api(avid, cid, qn=0): return 'https://api.bilibili.com/x/player/playurl?avid=%s&cid=%s&qn=%s&type=&otype=json&fnver=0&fnval=16&fourk=1' % (avid, cid, qn) - @staticmethod - def bilibili_api_v2(cid, bvid, qn=0): - return 'https://api.bilibili.com/x/player/playurl?cid=%s&qn=%s&type=&otype=json&fourk=1&bvid=%s&fnver=0&fnval=976' % (cid, qn, bvid) - @staticmethod def bilibili_audio_api(sid): return 'https://www.bilibili.com/audio/music-service-c/web/url?sid=%s' % sid @@ -116,12 +116,16 @@ def bilibili_live_room_init_api(room_id): def bilibili_space_channel_api(mid, cid, pn=1, ps=100): return 'https://api.bilibili.com/x/space/channel/video?mid=%s&cid=%s&pn=%s&ps=%s&order=0&jsonp=jsonp' % (mid, cid, pn, ps) + @staticmethod + def bilibili_series_archives_api(mid, sid, pn=1, ps=100): + return 'https://api.bilibili.com/x/series/archives?mid=%s&series_id=%s&pn=%s&ps=%s&only_normal=true&sort=asc&jsonp=jsonp' % (mid, sid, pn, ps) + @staticmethod def bilibili_space_favlist_api(fid, pn=1, ps=20): return 'https://api.bilibili.com/x/v3/fav/resource/list?media_id=%s&pn=%s&ps=%s&order=mtime&type=0&tid=0&jsonp=jsonp' % (fid, pn, ps) @staticmethod - def bilibili_space_video_api(mid, pn=1, ps=100): + def bilibili_space_video_api(mid, pn=1, ps=50): return "https://api.bilibili.com/x/space/arc/search?mid=%s&pn=%s&ps=%s&tid=0&keyword=&order=pubdate&jsonp=jsonp" % (mid, pn, ps) @staticmethod @@ -133,17 +137,19 @@ def bilibili_h_api(doc_id): return 'https://api.vc.bilibili.com/link_draw/v1/doc/detail?doc_id=%s' % doc_id @staticmethod - async def url_size(url, faker=False, headers={},err_value=0): + def url_size(url, faker=False, headers={},err_value=0): try: - return await url_size(url,faker,headers) + return url_size(url,faker,headers) except: return err_value - async def prepare(self, **kwargs): + def prepare(self, **kwargs): self.stream_qualities = {s['quality']: s for s in self.stream_types} + self.streams.clear() + self.dash_streams.clear() try: - html_content = await get_content(self.url, headers=self.bilibili_headers(referer=self.url)) + html_content = get_content(self.url, headers=self.bilibili_headers(referer=self.url)) except: html_content = '' # live always returns 400 (why?) #self.title = match1(html_content, @@ -154,7 +160,7 @@ async def prepare(self, **kwargs): avid = match1(self.url, r'/(av\d+)') or match1(self.url, r'/(BV\w+)') p = int(match1(self.url, r'/p(\d+)') or '1') self.url = 'https://www.bilibili.com/video/%s?p=%s' % (avid, p) - html_content = await get_content(self.url, headers=self.bilibili_headers()) + html_content = get_content(self.url, headers=self.bilibili_headers()) # redirect: bangumi/play/ss -> bangumi/play/ep # redirect: bangumi.bilibili.com/anime -> bangumi/play/ep @@ -164,12 +170,17 @@ async def prepare(self, **kwargs): initial_state = json.loads(initial_state_text) ep_id = initial_state['epList'][0]['id'] self.url = 'https://www.bilibili.com/bangumi/play/ep%s' % ep_id - html_content = await get_content(self.url, headers=self.bilibili_headers(referer=self.url)) + html_content = get_content(self.url, headers=self.bilibili_headers(referer=self.url)) # redirect: s elif re.match(r'https?://(www\.)?bilibili\.com/s/(.+)', self.url): self.url = 'https://www.bilibili.com/%s' % match1(self.url, r'/s/(.+)') - html_content = await get_content(self.url, headers=self.bilibili_headers()) + html_content = get_content(self.url, headers=self.bilibili_headers()) + + # redirect: festival + elif re.match(r'https?://(www\.)?bilibili\.com/festival/(.+)', self.url): + self.url = 'https://www.bilibili.com/video/%s' % match1(self.url, r'bvid=([^&]+)') + html_content = get_content(self.url, headers=self.bilibili_headers()) # sort it out if re.match(r'https?://(www\.)?bilibili\.com/audio/au(\d+)', self.url): @@ -182,48 +193,58 @@ async def prepare(self, **kwargs): sort = 'live' elif re.match(r'https?://vc\.bilibili\.com/video/(\d+)', self.url): sort = 'vc' - elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|(BV(\S+)))', self.url): + elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|(bv(\S+))|(BV(\S+)))', self.url): sort = 'video' elif re.match(r'https?://h\.?bilibili\.com/(\d+)', self.url): sort = 'h' else: - raise Exception('playlist not supported') self.download_playlist_by_url(self.url, **kwargs) return - self.duration_ms = None # regular video if sort == 'video': initial_state_text = match1(html_content, r'__INITIAL_STATE__=(.*?);\(function\(\)') # FIXME - self.duration_ms = 0#int(match1(html_content, r'\"timelength\":(\d+)')) initial_state = json.loads(initial_state_text) + playinfo_text = match1(html_content, r'__playinfo__=(.*?)', cont) @@ -33,13 +33,12 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg log.e('[Warning] Cookies needed.') post = json.loads(data.group(1)) - if 'edge_sidecar_to_children' in post['graphql']['shortcode_media']: - edges = post['graphql']['shortcode_media']['edge_sidecar_to_children']['edges'] - for edge in edges: - title = edge['node']['shortcode'] - image_url = edge['node']['display_url'] - if 'video_url' in edge['node']: - image_url = edge['node']['video_url'] + for item in post['items']: + code = item['code'] + carousel_media = item.get('carousel_media') or [item] + for i, media in enumerate(carousel_media): + title = '%s [%s]' % (code, i) + image_url = media['image_versions2']['candidates'][0]['url'] ext = image_url.split('?')[0].split('.')[-1] size = int(get_head(image_url)['Content-Length']) @@ -50,21 +49,20 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg ext=ext, total_size=size, output_dir=output_dir) - else: - title = post['graphql']['shortcode_media']['shortcode'] - image_url = post['graphql']['shortcode_media']['display_url'] - if 'video_url' in post['graphql']['shortcode_media']: - image_url = post['graphql']['shortcode_media']['video_url'] - ext = image_url.split('?')[0].split('.')[-1] - size = int(get_head(image_url)['Content-Length']) - print_info(site_info, title, ext, size) - if not info_only: - download_urls(urls=[image_url], - title=title, - ext=ext, - total_size=size, - output_dir=output_dir) + # download videos (if any) + if 'video_versions' in media: + video_url = media['video_versions'][0]['url'] + ext = video_url.split('?')[0].split('.')[-1] + size = int(get_head(video_url)['Content-Length']) + + print_info(site_info, title, ext, size) + if not info_only: + download_urls(urls=[video_url], + title=title, + ext=ext, + total_size=size, + output_dir=output_dir) site_info = "Instagram.com" download = instagram_download diff --git a/you_get/extractors/interest.py b/you_get/extractors/interest.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/iqilu.py b/you_get/extractors/iqilu.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/iqiyi.py b/you_get/extractors/iqiyi.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/iwara.py b/you_get/extractors/iwara.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/ixigua.py b/you_get/extractors/ixigua.py old mode 100755 new mode 100644 index 2f11e7f..b368b38 --- a/you_get/extractors/ixigua.py +++ b/you_get/extractors/ixigua.py @@ -18,121 +18,95 @@ } -def int_overflow(val): - maxint = 2147483647 - if not -maxint - 1 <= val <= maxint: - val = (val + (maxint + 1)) % (2 * (maxint + 1)) - maxint - 1 - return val - - -def unsigned_right_shitf(n, i): - if n < 0: - n = ctypes.c_uint32(n).value - if i < 0: - return -int_overflow(n << abs(i)) - return int_overflow(n >> i) - - -def get_video_url_from_video_id(video_id): - """Splicing URLs according to video ID to get video details""" - # from js - data = [""] * 256 - for index, _ in enumerate(data): - t = index - for i in range(8): - t = -306674912 ^ unsigned_right_shitf(t, 1) if 1 & t else unsigned_right_shitf(t, 1) - data[index] = t - - def tmp(): - rand_num = random.random() - path = "/video/urls/v/1/toutiao/mp4/{video_id}?r={random_num}".format(video_id=video_id, - random_num=str(rand_num)[2:]) - e = o = r = -1 - i, a = 0, len(path) - while i < a: - e = ord(path[i]) - i += 1 - if e < 128: - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ e)] - else: - if e < 2048: - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (192 | e >> 6 & 31))] - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & e))] - else: - if 55296 <= e < 57344: - e = (1023 & e) + 64 - i += 1 - o = 1023 & t.url(i) - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (240 | e >> 8 & 7))] - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | e >> 2 & 63))] - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | o >> 6 & 15 | (3 & e) << 4))] - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & o))] - else: - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (224 | e >> 12 & 15))] - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | e >> 6 & 63))] - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & e))] - - return "https://ib.365yg.com{path}&s={param}".format(path=path, param=unsigned_right_shitf(r ^ -1, 0)) - - while 1: - url = tmp() - if url.split("=")[-1][0] != "-": # 参数s不能为负数 - return url - - -def ixigua_download(url, output_dir='.', merge=True, info_only=False, **kwargs): +def ixigua_download(url, output_dir='.', merge=True, info_only=False, stream_id='', **kwargs): # example url: https://www.ixigua.com/i6631065141750268420/#mid=63024814422 - resp = urlopen_with_retry(request.Request(url)) + headers['cookie'] = "MONITOR_WEB_ID=7892c49b-296e-4499-8704-e47c1b15123; " \ + "ixigua-a-s=1; ttcid=af99669b6304453480454f1507011d5c234; BD_REF=1; " \ + "__ac_nonce=060d88ff000a75e8d17eb; __ac_signature=_02B4Z6wo100f01kX9ZpgAAIDAKIBBQUIPYT5F2WIAAPG2ad; " \ + "ttwid=1%7CcIsVF_3vqSIk4XErhPB0H2VaTxT0tdsTMRbMjrJOPN8%7C1624806049%7C08ce7dd6f7d20506a41ba0a331ef96a6505d96731e6ad9f6c8c709f53f227ab1; " + + resp = urlopen_with_retry(request.Request(url, headers=headers)) html = resp.read().decode('utf-8') _cookies = [] for c in resp.getheader('Set-Cookie').split("httponly,"): _cookies.append(c.strip().split(' ')[0]) - headers['cookie'] = ' '.join(_cookies) + headers['cookie'] += ' '.join(_cookies) - conf = loads(match1(html, r"window\.config = (.+);")) - if not conf: - log.e("Get window.config from url failed, url: {}".format(url)) + match_txt = match1(html, r"', html) - info = json.loads(data) - wid = info['props']['initialProps']['$wid'] - cookie = 'tt_webid=%s; tt_webid_v2=%s' % (wid, wid) - - # here's the cookie - headers['Cookie'] = cookie - - # try again - html = get_content(url, headers=headers) - data = r1(r'', html) + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0', + 'Accept-Encoding': 'gzip, deflate', + 'Accept': '*/*', + 'Connection': 'keep-alive' # important + } + + m = re.match('(https?://)?([^/]+)(/.*)', url) + host = m.group(2) + if host != 'www.tiktok.com': # non-canonical URL + html = getHttps(host, url, headers=headers, gzip=False) + url = r1(r'(https://www.tiktok.com/[^?"]+)', html) + # use canonical URL + m = re.match('(https?://)?([^/]+)(/.*)', url) + host = m.group(2) + + url = m.group(3).split('?')[0] + vid = url.split('/')[3] # should be a string of numbers + + html = getHttps(host, url, headers=headers) + + data = r1(r'window\[\'SIGI_STATE\'\]=(.*?);window\[\'SIGI_RETRY\'\]', html) or \ + r1(r'', html) info = json.loads(data) - wid = info['props']['initialProps']['$wid'] - cookie = 'tt_webid=%s; tt_webid_v2=%s' % (wid, wid) - - videoData = info['props']['pageProps']['itemInfo']['itemStruct'] - videoId = videoData['id'] - videoUrl = videoData['video']['downloadAddr'] - uniqueId = videoData['author'].get('uniqueId') - nickName = videoData['author'].get('nickname') - - title = '%s [%s]' % (nickName or uniqueId, videoId) - - # we also need the referer - headers['Referer'] = referUrl + downloadAddr = info['ItemModule'][vid]['video']['downloadAddr'] + author = info['ItemModule'][vid]['author'] # same as uniqueId + nickname = info['UserModule']['users'][author]['nickname'] + title = '%s [%s]' % (nickname or author, vid) - mime, ext, size = url_info(videoUrl, headers=headers) + mime, ext, size = url_info(downloadAddr, headers=headers) print_info(site_info, title, mime, size) if not info_only: - download_urls([videoUrl], title, ext, size, output_dir=output_dir, merge=merge, headers=headers) + download_urls([downloadAddr], title, ext, size, output_dir=output_dir, merge=merge, headers=headers) site_info = "TikTok.com" download = tiktok_download diff --git a/you_get/extractors/toutiao.py b/you_get/extractors/toutiao.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/tucao.py b/you_get/extractors/tucao.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/tudou.py b/you_get/extractors/tudou.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/tumblr.py b/you_get/extractors/tumblr.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/twitter.py b/you_get/extractors/twitter.py old mode 100755 new mode 100644 index 2346821..8c052ed --- a/you_get/extractors/twitter.py +++ b/you_get/extractors/twitter.py @@ -51,7 +51,12 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) api_content = get_content(api_url, headers={'authorization': authorization, 'x-guest-token': guest_token}) info = json.loads(api_content) - if 'extended_entities' in info['globalObjects']['tweets'][item_id]: + if item_id not in info['globalObjects']['tweets']: + # something wrong here + log.w(info['timeline']['instructions'][0]['addEntries']['entries'][0]['content']['item']['content']['tombstone']['tombstoneInfo']['richText']['text']) + return + + elif 'extended_entities' in info['globalObjects']['tweets'][item_id]: # if the tweet contains media, download them media = info['globalObjects']['tweets'][item_id]['extended_entities']['media'] diff --git a/you_get/extractors/ucas.py b/you_get/extractors/ucas.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/universal.py b/you_get/extractors/universal.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/veoh.py b/you_get/extractors/veoh.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/vimeo.py b/you_get/extractors/vimeo.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/vine.py b/you_get/extractors/vine.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/vk.py b/you_get/extractors/vk.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/w56.py b/you_get/extractors/w56.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/wanmen.py b/you_get/extractors/wanmen.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/xiami.py b/you_get/extractors/xiami.py deleted file mode 100755 index 16656ad..0000000 --- a/you_get/extractors/xiami.py +++ /dev/null @@ -1,215 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -__all__ = ['xiami_download'] - -from ..common import * - -from xml.dom.minidom import parseString -from urllib import parse - -def location_dec(str): - head = int(str[0]) - str = str[1:] - rows = head - cols = int(len(str)/rows) + 1 - - out = "" - full_row = len(str) % head - for c in range(cols): - for r in range(rows): - if c == (cols - 1) and r >= full_row: - continue - if r < full_row: - char = str[r*cols+c] - else: - char = str[cols*full_row+(r-full_row)*(cols-1)+c] - out += char - return parse.unquote(out).replace("^", "0") - -def xiami_download_lyric(lrc_url, file_name, output_dir): - lrc = get_content(lrc_url, headers=fake_headers) - filename = get_filename(file_name) - if len(lrc) > 0: - with open(output_dir + "/" + filename + '.lrc', 'w', encoding='utf-8') as x: - x.write(lrc) - -def xiami_download_pic(pic_url, file_name, output_dir): - from ..util.strings import get_filename - pic_url = pic_url.replace('_1', '') - pos = pic_url.rfind('.') - ext = pic_url[pos:] - pic = get_content(pic_url, headers=fake_headers, decoded=False) - if len(pic) > 0: - with open(output_dir + "/" + file_name.replace('/', '-') + ext, 'wb') as x: - x.write(pic) - -def xiami_download_song(sid, output_dir = '.', info_only = False): - xml = get_content('http://www.xiami.com/song/playlist/id/%s/object_name/default/object_id/0' % sid, headers=fake_headers) - doc = parseString(xml) - i = doc.getElementsByTagName("track")[0] - artist = i.getElementsByTagName("artist")[0].firstChild.nodeValue - album_name = i.getElementsByTagName("album_name")[0].firstChild.nodeValue - song_title = i.getElementsByTagName("name")[0].firstChild.nodeValue - url = location_dec(i.getElementsByTagName("location")[0].firstChild.nodeValue) - try: - lrc_url = i.getElementsByTagName("lyric")[0].firstChild.nodeValue - except: - pass - type_, ext, size = url_info(url, headers=fake_headers) - if not ext: - ext = 'mp3' - - print_info(site_info, song_title, ext, size) - if not info_only: - file_name = "%s - %s - %s" % (song_title, artist, album_name) - download_urls([url], file_name, ext, size, output_dir, headers=fake_headers) - try: - xiami_download_lyric(lrc_url, file_name, output_dir) - except: - pass - -def xiami_download_showcollect(cid, output_dir = '.', info_only = False): - html = get_content('http://www.xiami.com/song/showcollect/id/' + cid, headers=fake_headers) - collect_name = r1(r'(.*)', html) - - xml = get_content('http://www.xiami.com/song/playlist/id/%s/type/3' % cid, headers=fake_headers) - doc = parseString(xml) - output_dir = output_dir + "/" + "[" + collect_name + "]" - tracks = doc.getElementsByTagName("track") - track_nr = 1 - for i in tracks: - artist=album_name=song_title=url="" - try: - song_id = i.getElementsByTagName("song_id")[0].firstChild.nodeValue - artist = i.getElementsByTagName("artist")[0].firstChild.nodeValue - album_name = i.getElementsByTagName("album_name")[0].firstChild.nodeValue - song_title = i.getElementsByTagName("title")[0].firstChild.nodeValue - url = location_dec(i.getElementsByTagName("location")[0].firstChild.nodeValue) - except: - log.e("Song %s failed. [Info Missing] artist:%s, album:%s, title:%s, url:%s" % (song_id, artist, album_name, song_title, url)) - continue - try: - lrc_url = i.getElementsByTagName("lyric")[0].firstChild.nodeValue - except: - pass - type_, ext, size = url_info(url, headers=fake_headers) - if not ext: - ext = 'mp3' - - print_info(site_info, song_title, ext, size) - if not info_only: - file_name = "%02d.%s - %s - %s" % (track_nr, song_title, artist, album_name) - download_urls([url], file_name, ext, size, output_dir, headers=fake_headers) - try: - xiami_download_lyric(lrc_url, file_name, output_dir) - except: - pass - - track_nr += 1 - -def xiami_download_album(aid, output_dir='.', info_only=False): - xml = get_content('http://www.xiami.com/song/playlist/id/%s/type/1' % aid, headers=fake_headers) - album_name = r1(r'', xml) - artist = r1(r'', xml) - doc = parseString(xml) - output_dir = output_dir + "/%s - %s" % (artist, album_name) - track_list = doc.getElementsByTagName('trackList')[0] - tracks = track_list.getElementsByTagName("track") - track_nr = 1 - pic_exist = False - for i in tracks: -#in this xml track tag is used for both "track in a trackList" and track no -#dirty here - if i.firstChild.nodeValue is not None: - continue - song_title = i.getElementsByTagName("songName")[0].firstChild.nodeValue - url = location_dec(i.getElementsByTagName("location")[0].firstChild.nodeValue) - try: - lrc_url = i.getElementsByTagName("lyric")[0].firstChild.nodeValue - except: - pass - if not pic_exist: - pic_url = i.getElementsByTagName("pic")[0].firstChild.nodeValue - type_, ext, size = url_info(url, headers=fake_headers) - if not ext: - ext = 'mp3' - - print_info(site_info, song_title, ext, size) - if not info_only: - file_name = "%02d.%s" % (track_nr, song_title) - download_urls([url], file_name, ext, size, output_dir, headers=fake_headers) - try: - xiami_download_lyric(lrc_url, file_name, output_dir) - except: - pass - if not pic_exist: - xiami_download_pic(pic_url, 'cover', output_dir) - pic_exist = True - - track_nr += 1 - -def xiami_download_mv(url, output_dir='.', merge=True, info_only=False): - # FIXME: broken merge - page = get_content(url, headers=fake_headers) - title = re.findall('([^<]+)', page)[0] - vid, uid = re.findall(r'vid:"(\d+)",uid:"(\d+)"', page)[0] - api_url = 'http://cloud.video.taobao.com/videoapi/info.php?vid=%s&uid=%s' % (vid, uid) - result = get_content(api_url, headers=fake_headers) - doc = parseString(result) - video_url = doc.getElementsByTagName("video_url")[-1].firstChild.nodeValue - length = int(doc.getElementsByTagName("length")[-1].firstChild.nodeValue) - - v_urls = [] - k_start = 0 - total_size = 0 - while True: - k_end = k_start + 20000000 - if k_end >= length: k_end = length - 1 - v_url = video_url + '/start_%s/end_%s/1.flv' % (k_start, k_end) - try: - _, ext, size = url_info(v_url) - except: - break - v_urls.append(v_url) - total_size += size - k_start = k_end + 1 - - print_info(site_info, title, ext, total_size) - if not info_only: - download_urls(v_urls, title, ext, total_size, output_dir, merge=merge, headers=fake_headers) - -def xiami_download(url, output_dir='.', merge=True, info_only=False, **kwargs): -#albums - if re.match(r'http://www.xiami.com/album/\d+', url): - id = r1(r'http://www.xiami.com/album/(\d+)', url) - xiami_download_album(id, output_dir, info_only) - elif re.match(r'http://www.xiami.com/album/\w+', url): - page = get_content(url, headers=fake_headers) - album_id = re.search(r'rel="canonical"\s+href="http://www.xiami.com/album/([^"]+)"', page).group(1) - xiami_download_album(album_id, output_dir, info_only) - -#collections - if re.match(r'http://www.xiami.com/collect/\d+', url): - id = r1(r'http://www.xiami.com/collect/(\d+)', url) - xiami_download_showcollect(id, output_dir, info_only) - -#single track - if re.match(r'http://www.xiami.com/song/\d+\b', url): - id = r1(r'http://www.xiami.com/song/(\d+)', url) - xiami_download_song(id, output_dir, info_only) - elif re.match(r'http://www.xiami.com/song/\w+', url): - html = get_content(url, headers=fake_headers) - id = r1(r'rel="canonical" href="http://www.xiami.com/song/([^"]+)"', html) - xiami_download_song(id, output_dir, info_only) - - if re.match('http://www.xiami.com/song/detail/id/\d+', url): - id = r1(r'http://www.xiami.com/song/detail/id/(\d+)', url) - xiami_download_song(id, output_dir, info_only) - - if re.match('http://www.xiami.com/mv', url): - xiami_download_mv(url, output_dir, merge=merge, info_only=info_only) - -site_info = "Xiami.com" -download = xiami_download -download_playlist = playlist_not_supported("xiami") diff --git a/you_get/extractors/ximalaya.py b/you_get/extractors/ximalaya.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/xinpianchang.py b/you_get/extractors/xinpianchang.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/yinyuetai.py b/you_get/extractors/yinyuetai.py deleted file mode 100755 index 6c39540..0000000 --- a/you_get/extractors/yinyuetai.py +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python - -__all__ = ['yinyuetai_download', 'yinyuetai_download_by_id'] - -from ..common import * - -def yinyuetai_download_by_id(vid, title=None, output_dir='.', merge=True, info_only=False): - video_info = json.loads(get_html('http://www.yinyuetai.com/insite/get-video-info?json=true&videoId=%s' % vid)) - url_models = video_info['videoInfo']['coreVideoInfo']['videoUrlModels'] - url_models = sorted(url_models, key=lambda i: i['qualityLevel']) - url = url_models[-1]['videoUrl'] - type = ext = r1(r'\.(flv|mp4)', url) - _, _, size = url_info(url) - - print_info(site_info, title, type, size) - if not info_only: - download_urls([url], title, ext, size, output_dir, merge = merge) - -def yinyuetai_download(url, output_dir='.', merge=True, info_only=False, **kwargs): - id = r1(r'http://\w+.yinyuetai.com/video/(\d+)', url) or \ - r1(r'http://\w+.yinyuetai.com/video/h5/(\d+)', url) - if not id: - yinyuetai_download_playlist(url, output_dir=output_dir, merge=merge, info_only=info_only) - return - - html = get_html(url, 'utf-8') - title = r1(r'<meta property="og:title"\s+content="([^"]+)"/>', html) or r1(r'<title>(.*)', html) - assert title - title = parse.unquote(title) - title = escape_file_path(title) - yinyuetai_download_by_id(id, title, output_dir, merge=merge, info_only=info_only) - -def yinyuetai_download_playlist(url, output_dir='.', merge=True, info_only=False, **kwargs): - playlist = r1(r'http://\w+.yinyuetai.com/playlist/(\d+)', url) - html = get_html(url) - data_ids = re.findall(r'data-index="\d+"\s*data-id=(\d+)', html) - for data_id in data_ids: - yinyuetai_download('http://v.yinyuetai.com/video/' + data_id, - output_dir=output_dir, merge=merge, info_only=info_only) - -site_info = "YinYueTai.com" -download = yinyuetai_download -download_playlist = yinyuetai_download_playlist diff --git a/you_get/extractors/yixia.py b/you_get/extractors/yixia.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/yizhibo.py b/you_get/extractors/yizhibo.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/youku.py b/you_get/extractors/youku.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/youtube.py b/you_get/extractors/youtube.py old mode 100755 new mode 100644 index 81b45ac..3e1c5ca --- a/you_get/extractors/youtube.py +++ b/you_get/extractors/youtube.py @@ -78,6 +78,7 @@ def s_to_sig(js, s): # - https://www.youtube.com/yts/jsbin/player_ias-vfl-jbnrr/da_DK/base.js # - https://www.youtube.com/s/player/0b643cd1/player_ias.vflset/sv_SE/base.js # - https://www.youtube.com/s/player/50e823fc/player_ias.vflset/sv_SE/base.js + # - https://www.youtube.com/s/player/3b5d5649/player_ias.vflset/sv_SE/base.js def tr_js(code): code = re.sub(r'function', r'def', code) # add prefix '_sig_' to prevent namespace pollution @@ -117,7 +118,9 @@ def tr_js(code): f2 = re.sub(r'\$', '_dollar', f2) code = code + 'global _sig_%s\n' % f2 + tr_js(f2def) - f1 = re.sub(r'(as|if|in|is|or)', r'_\1', f1) + # if f1 contains more than 2 characters, no need to do substitution + # FIXME: we probably shouldn't do any substitution here at all? + f1 = re.sub(r'^(as|if|in|is|or)$', r'_\1', f1) f1 = re.sub(r'\$', '_dollar', f1) code = code + '_sig=_sig_%s(s)' % f1 exec(code, globals(), locals()) @@ -141,6 +144,7 @@ def get_vid_from_url(url): """ return match1(url, r'youtu\.be/([^?/]+)') or \ match1(url, r'youtube\.com/embed/([^/?]+)') or \ + match1(url, r'youtube\.com/shorts/([^/?]+)') or \ match1(url, r'youtube\.com/v/([^/?]+)') or \ match1(url, r'youtube\.com/watch/([^/?]+)') or \ parse_query_param(url, 'v') or \ @@ -233,7 +237,7 @@ def prepare(self, **kwargs): except: # ytplayer_config = {args:{raw_player_response:ytInitialPlayerResponse}} - ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});</script>', video_page).group(1)) stream_list = ytInitialPlayerResponse['streamingData']['formats'] #stream_list = ytInitialPlayerResponse['streamingData']['adaptiveFormats'] @@ -258,7 +262,7 @@ def prepare(self, **kwargs): # Parse video page instead video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) - ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});</script>', video_page).group(1)) self.title = ytInitialPlayerResponse["videoDetails"]["title"] if re.search('([^"]*/base\.js)"', video_page): diff --git a/you_get/extractors/zhanqi.py b/you_get/extractors/zhanqi.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/zhibo.py b/you_get/extractors/zhibo.py old mode 100755 new mode 100644 diff --git a/you_get/extractors/zhihu.py b/you_get/extractors/zhihu.py old mode 100755 new mode 100644 index 64f8142..1dceef5 --- a/you_get/extractors/zhihu.py +++ b/you_get/extractors/zhihu.py @@ -31,8 +31,8 @@ def zhihu_download(url, output_dir='.', merge=True, info_only=False, **kwargs): play_list = video_info["playlist"] # first High Definition - # second Second Standard Definition - # third ld. What is ld ? + # second Standard Definition + # third Low Definition # finally continue data = play_list.get("hd", play_list.get("sd", play_list.get("ld", None))) if not data: diff --git a/you_get/json_output.py b/you_get/json_output.py old mode 100755 new mode 100644 index c619576..09f546b --- a/you_get/json_output.py +++ b/you_get/json_output.py @@ -31,6 +31,7 @@ def output(video_extractor, pretty_print=True): if pretty_print: print(json.dumps(out, indent=4, ensure_ascii=False)) else: + print(json.dumps(out)) # a fake VideoExtractor object to save info diff --git a/you_get/processor/__init__.py b/you_get/processor/__init__.py old mode 100755 new mode 100644 diff --git a/you_get/processor/ffmpeg.py b/you_get/processor/ffmpeg.py old mode 100755 new mode 100644 index 11126c2..50e2c9f --- a/you_get/processor/ffmpeg.py +++ b/you_get/processor/ffmpeg.py @@ -93,7 +93,7 @@ def ffmpeg_concat_mp4_to_mpg(files, output='output.mpg'): # Use concat demuxer on FFmpeg >= 1.1 if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)): concat_list = generate_concat_list(files, output) - params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1', + params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '0', '-i', concat_list, '-c', 'copy'] params.extend(['--', output]) if subprocess.call(params, stdin=STDIN) == 0: @@ -149,7 +149,7 @@ def ffmpeg_concat_flv_to_mp4(files, output='output.mp4'): # Use concat demuxer on FFmpeg >= 1.1 if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)): concat_list = generate_concat_list(files, output) - params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1', + params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '0', '-i', concat_list, '-c', 'copy', '-bsf:a', 'aac_adtstoasc'] params.extend(['--', output]) @@ -203,7 +203,7 @@ def ffmpeg_concat_mp4_to_mp4(files, output='output.mp4'): # Use concat demuxer on FFmpeg >= 1.1 if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)): concat_list = generate_concat_list(files, output) - params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1', + params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '0', '-i', concat_list, '-c', 'copy', '-bsf:a', 'aac_adtstoasc'] params.extend(['--', output]) diff --git a/you_get/processor/join_flv.py b/you_get/processor/join_flv.py old mode 100755 new mode 100644 diff --git a/you_get/processor/join_mp4.py b/you_get/processor/join_mp4.py old mode 100755 new mode 100644 diff --git a/you_get/processor/join_ts.py b/you_get/processor/join_ts.py old mode 100755 new mode 100644 diff --git a/you_get/processor/rtmpdump.py b/you_get/processor/rtmpdump.py old mode 100755 new mode 100644 diff --git a/you_get/util/__init__.py b/you_get/util/__init__.py old mode 100755 new mode 100644 diff --git a/you_get/util/fs.py b/you_get/util/fs.py old mode 100755 new mode 100644 diff --git a/you_get/util/git.py b/you_get/util/git.py old mode 100755 new mode 100644 diff --git a/you_get/util/log.py b/you_get/util/log.py old mode 100755 new mode 100644 diff --git a/you_get/util/os.py b/you_get/util/os.py old mode 100755 new mode 100644 diff --git a/you_get/util/strings.py b/you_get/util/strings.py old mode 100755 new mode 100644 diff --git a/you_get/util/term.py b/you_get/util/term.py old mode 100755 new mode 100644 diff --git a/you_get/version.py b/you_get/version.py old mode 100755 new mode 100644 index f7daa7f..da7d3c3 --- a/you_get/version.py +++ b/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1545' +__version__ = '0.4.1612'