diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index 5141159d27..5c1129a975 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -369,6 +369,8 @@ class YoutubeDL(object): _pps = [] _download_retcode = None _num_downloads = None + _playlist_level = 0 + _playlist_urls = set() _screen_file = None def __init__(self, params=None, auto_init=True): @@ -1012,113 +1014,23 @@ class YoutubeDL(object): return self.process_ie_result( new_result, download=download, extra_info=extra_info) elif result_type in ('playlist', 'multi_video'): - # We process each entry in the playlist - playlist = ie_result.get('title') or ie_result.get('id') - self.to_screen('[download] Downloading playlist: %s' % playlist) - - playlist_results = [] - - playliststart = self.params.get('playliststart', 1) - 1 - playlistend = self.params.get('playlistend') - # For backwards compatibility, interpret -1 as whole list - if playlistend == -1: - playlistend = None - - playlistitems_str = self.params.get('playlist_items') - playlistitems = None - if playlistitems_str is not None: - def iter_playlistitems(format): - for string_segment in format.split(','): - if '-' in string_segment: - start, end = string_segment.split('-') - for item in range(int(start), int(end) + 1): - yield int(item) - else: - yield int(string_segment) - playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) - - ie_entries = ie_result['entries'] - - def make_playlistitems_entries(list_ie_entries): - num_entries = len(list_ie_entries) - return [ - list_ie_entries[i - 1] for i in playlistitems - if -num_entries <= i - 1 < num_entries] - - def report_download(num_entries): + # Protect from infinite recursion due to recursively nested playlists + # (see https://github.com/ytdl-org/youtube-dl/issues/27833) + webpage_url = ie_result['webpage_url'] + if webpage_url in self._playlist_urls: self.to_screen( - '[%s] playlist %s: Downloading %d videos' % - (ie_result['extractor'], playlist, num_entries)) 
+ '[download] Skipping already downloaded playlist: %s' + % (ie_result.get('title') or ie_result.get('id'))) + return - if isinstance(ie_entries, list): - n_all_entries = len(ie_entries) - if playlistitems: - entries = make_playlistitems_entries(ie_entries) - else: - entries = ie_entries[playliststart:playlistend] - n_entries = len(entries) - self.to_screen( - '[%s] playlist %s: Collected %d video ids (downloading %d of them)' % - (ie_result['extractor'], playlist, n_all_entries, n_entries)) - elif isinstance(ie_entries, PagedList): - if playlistitems: - entries = [] - for item in playlistitems: - entries.extend(ie_entries.getslice( - item - 1, item - )) - else: - entries = ie_entries.getslice( - playliststart, playlistend) - n_entries = len(entries) - report_download(n_entries) - else: # iterable - if playlistitems: - entries = make_playlistitems_entries(list(itertools.islice( - ie_entries, 0, max(playlistitems)))) - else: - entries = list(itertools.islice( - ie_entries, playliststart, playlistend)) - n_entries = len(entries) - report_download(n_entries) - - if self.params.get('playlistreverse', False): - entries = entries[::-1] - - if self.params.get('playlistrandom', False): - random.shuffle(entries) - - x_forwarded_for = ie_result.get('__x_forwarded_for_ip') - - for i, entry in enumerate(entries, 1): - self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) - # This __x_forwarded_for_ip thing is a bit ugly but requires - # minimal changes - if x_forwarded_for: - entry['__x_forwarded_for_ip'] = x_forwarded_for - extra = { - 'n_entries': n_entries, - 'playlist': playlist, - 'playlist_id': ie_result.get('id'), - 'playlist_title': ie_result.get('title'), - 'playlist_uploader': ie_result.get('uploader'), - 'playlist_uploader_id': ie_result.get('uploader_id'), - 'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart, - 'extractor': ie_result['extractor'], - 'webpage_url': ie_result['webpage_url'], - 'webpage_url_basename': 
url_basename(ie_result['webpage_url']), - 'extractor_key': ie_result['extractor_key'], - } - - if self._match_entry(entry, incomplete=True) is not None: - continue - - entry_result = self.__process_iterable_entry(entry, download, extra) - # TODO: skip failed (empty) entries? - playlist_results.append(entry_result) - ie_result['entries'] = playlist_results - self.to_screen('[download] Finished downloading playlist: %s' % playlist) - return ie_result + self._playlist_level += 1 + self._playlist_urls.add(webpage_url) + try: + return self.__process_playlist(ie_result, download) + finally: + self._playlist_level -= 1 + if not self._playlist_level: + self._playlist_urls.clear() elif result_type == 'compat_list': self.report_warning( 'Extractor %s returned a compat_list result. ' @@ -1143,6 +1055,115 @@ class YoutubeDL(object): else: raise Exception('Invalid result type: %s' % result_type) + def __process_playlist(self, ie_result, download): + # We process each entry in the playlist + playlist = ie_result.get('title') or ie_result.get('id') + self.to_screen('[download] Downloading playlist: %s' % playlist) + + playlist_results = [] + + playliststart = self.params.get('playliststart', 1) - 1 + playlistend = self.params.get('playlistend') + # For backwards compatibility, interpret -1 as whole list + if playlistend == -1: + playlistend = None + + playlistitems_str = self.params.get('playlist_items') + playlistitems = None + if playlistitems_str is not None: + def iter_playlistitems(format): + for string_segment in format.split(','): + if '-' in string_segment: + start, end = string_segment.split('-') + for item in range(int(start), int(end) + 1): + yield int(item) + else: + yield int(string_segment) + playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) + + ie_entries = ie_result['entries'] + + def make_playlistitems_entries(list_ie_entries): + num_entries = len(list_ie_entries) + return [ + list_ie_entries[i - 1] for i in playlistitems + if -num_entries <= i - 
1 < num_entries] + + def report_download(num_entries): + self.to_screen( + '[%s] playlist %s: Downloading %d videos' % + (ie_result['extractor'], playlist, num_entries)) + + if isinstance(ie_entries, list): + n_all_entries = len(ie_entries) + if playlistitems: + entries = make_playlistitems_entries(ie_entries) + else: + entries = ie_entries[playliststart:playlistend] + n_entries = len(entries) + self.to_screen( + '[%s] playlist %s: Collected %d video ids (downloading %d of them)' % + (ie_result['extractor'], playlist, n_all_entries, n_entries)) + elif isinstance(ie_entries, PagedList): + if playlistitems: + entries = [] + for item in playlistitems: + entries.extend(ie_entries.getslice( + item - 1, item + )) + else: + entries = ie_entries.getslice( + playliststart, playlistend) + n_entries = len(entries) + report_download(n_entries) + else: # iterable + if playlistitems: + entries = make_playlistitems_entries(list(itertools.islice( + ie_entries, 0, max(playlistitems)))) + else: + entries = list(itertools.islice( + ie_entries, playliststart, playlistend)) + n_entries = len(entries) + report_download(n_entries) + + if self.params.get('playlistreverse', False): + entries = entries[::-1] + + if self.params.get('playlistrandom', False): + random.shuffle(entries) + + x_forwarded_for = ie_result.get('__x_forwarded_for_ip') + + for i, entry in enumerate(entries, 1): + self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) + # This __x_forwarded_for_ip thing is a bit ugly but requires + # minimal changes + if x_forwarded_for: + entry['__x_forwarded_for_ip'] = x_forwarded_for + extra = { + 'n_entries': n_entries, + 'playlist': playlist, + 'playlist_id': ie_result.get('id'), + 'playlist_title': ie_result.get('title'), + 'playlist_uploader': ie_result.get('uploader'), + 'playlist_uploader_id': ie_result.get('uploader_id'), + 'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart, + 'extractor': ie_result['extractor'], + 'webpage_url': 
ie_result['webpage_url'], + 'webpage_url_basename': url_basename(ie_result['webpage_url']), + 'extractor_key': ie_result['extractor_key'], + } + + if self._match_entry(entry, incomplete=True) is not None: + continue + + entry_result = self.__process_iterable_entry(entry, download, extra) + # TODO: skip failed (empty) entries? + playlist_results.append(entry_result) + ie_result['entries'] = playlist_results + self.to_screen('[download] Finished downloading playlist: %s' % playlist) + return ie_result + @__handle_extraction_exceptions def __process_iterable_entry(self, entry, download, extra_info): return self.process_ie_result( diff --git a/youtube_dlc/extractor/adn.py b/youtube_dlc/extractor/adn.py index c95ad21735..d611ee2374 100644 --- a/youtube_dlc/extractor/adn.py +++ b/youtube_dlc/extractor/adn.py @@ -10,6 +10,7 @@ import random from .common import InfoExtractor from ..aes import aes_cbc_decrypt from ..compat import ( + compat_HTTPError, compat_b64decode, compat_ord, ) @@ -18,11 +19,13 @@ from ..utils import ( bytes_to_long, ExtractorError, float_or_none, + int_or_none, intlist_to_bytes, long_to_bytes, pkcs1pad, strip_or_none, - urljoin, + try_get, + unified_strdate, ) @@ -31,16 +34,27 @@ class ADNIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?animedigitalnetwork\.fr/video/[^/]+/(?P<id>\d+)' _TEST = { 'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites', - 'md5': 'e497370d847fd79d9d4c74be55575c7a', + 'md5': '0319c99885ff5547565cacb4f3f9348d', 'info_dict': { 'id': '7778', 'ext': 'mp4', - 'title': 'Blue Exorcist - Kyôto Saga - Épisode 1', + 'title': 'Blue Exorcist - Kyôto Saga - Episode 1', 'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5', + 'series': 'Blue Exorcist - Kyôto Saga', + 'duration': 1467, + 'release_date': '20170106', + 'comment_count': int, + 'average_rating': float, + 'season_number': 2, + 'episode': 'Début des hostilités', + 'episode_number': 1, } } + _BASE_URL = 
'http://animedigitalnetwork.fr' - _RSA_KEY = (0xc35ae1e4356b65a73b551493da94b8cb443491c0aa092a357a5aee57ffc14dda85326f42d716e539a34542a0d3f363adf16c5ec222d713d5997194030ee2e4f0d1fb328c01a81cf6868c090d50de8e169c6b13d1675b9eeed1cbc51e1fffca9b38af07f37abd790924cd3bee59d0257cfda4fe5f3f0534877e21ce5821447d1b, 65537) + _API_BASE_URL = 'https://gw.api.animedigitalnetwork.fr/' + _PLAYER_BASE_URL = _API_BASE_URL + 'player/' + _RSA_KEY = (0x9B42B08905199A5CCE2026274399CA560ECB209EE9878A708B1C0812E1BB8CB5D1FB7441861147C1A1F2F3A0476DD63A9CAC20D3E983613346850AA6CB38F16DC7D720FD7D86FC6E5B3D5BBC72E14CD0BF9E869F2CEA2CCAD648F1DCE38F1FF916CEFB2D339B64AA0264372344BC775E265E8A852F88144AB0BD9AA06C1A4ABB, 65537) _POS_ALIGN_MAP = { 'start': 1, 'end': 3, @@ -54,26 +68,24 @@ class ADNIE(InfoExtractor): def _ass_subtitles_timecode(seconds): return '%01d:%02d:%02d.%02d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 100) - def _get_subtitles(self, sub_path, video_id): - if not sub_path: + def _get_subtitles(self, sub_url, video_id): + if not sub_url: return None enc_subtitles = self._download_webpage( - urljoin(self._BASE_URL, sub_path), - video_id, 'Downloading subtitles location', fatal=False) or '{}' + sub_url, video_id, 'Downloading subtitles location', fatal=False) or '{}' subtitle_location = (self._parse_json(enc_subtitles, video_id, fatal=False) or {}).get('location') if subtitle_location: enc_subtitles = self._download_webpage( - urljoin(self._BASE_URL, subtitle_location), - video_id, 'Downloading subtitles data', fatal=False, - headers={'Origin': 'https://animedigitalnetwork.fr'}) + subtitle_location, video_id, 'Downloading subtitles data', + fatal=False, headers={'Origin': 'https://animedigitalnetwork.fr'}) if not enc_subtitles: return None # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js dec_subtitles = intlist_to_bytes(aes_cbc_decrypt( bytes_to_intlist(compat_b64decode(enc_subtitles[24:])), - 
bytes_to_intlist(binascii.unhexlify(self._K + '4b8ef13ec1872730')), + bytes_to_intlist(binascii.unhexlify(self._K + 'ab9f52f5baae7c72')), bytes_to_intlist(compat_b64decode(enc_subtitles[:24])) )) subtitles_json = self._parse_json( @@ -119,59 +131,76 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - player_config = self._parse_json(self._search_regex( - r'playerConfig\s*=\s*({.+});', webpage, - 'player config', default='{}'), video_id, fatal=False) - if not player_config: - config_url = urljoin(self._BASE_URL, self._search_regex( - r'(?:id="player"|class="[^"]*adn-player-container[^"]*")[^>]+data-url="([^"]+)"', - webpage, 'config url')) - player_config = self._download_json( - config_url, video_id, - 'Downloading player config JSON metadata')['player'] + video_base_url = self._PLAYER_BASE_URL + 'video/%s/' % video_id + player = self._download_json( + video_base_url + 'configuration', video_id, + 'Downloading player config JSON metadata')['player'] + options = player['options'] - video_info = {} - video_info_str = self._search_regex( - r'videoInfo\s*=\s*({.+});', webpage, - 'video info', fatal=False) - if video_info_str: - video_info = self._parse_json( - video_info_str, video_id, fatal=False) or {} + user = options['user'] + if not user.get('hasAccess'): + raise ExtractorError( + 'This video is only available for paying users', expected=True) + # self.raise_login_required() # FIXME: Login is not implemented - options = player_config.get('options') or {} - metas = options.get('metas') or {} - links = player_config.get('links') or {} - sub_path = player_config.get('subtitles') - error = None - if not links: - links_url = player_config.get('linksurl') or options['videoUrl'] - token = options['token'] - self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)]) - message = bytes_to_intlist(json.dumps({ - 'k': 
self._K, - 'e': 60, - 't': token, - })) + token = self._download_json( + user.get('refreshTokenUrl') or (self._PLAYER_BASE_URL + 'refresh/token'), + video_id, 'Downloading access token', headers={ + 'x-player-refresh-token': user['refreshToken'] + }, data=b'')['token'] + + links_url = try_get(options, lambda x: x['video']['url']) or (video_base_url + 'link') + self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)]) + message = bytes_to_intlist(json.dumps({ + 'k': self._K, + 't': token, + })) + + # Sometimes authentication fails for no good reason, retry with + # a different random padding + links_data = None + for _ in range(3): padded_message = intlist_to_bytes(pkcs1pad(message, 128)) n, e = self._RSA_KEY encrypted_message = long_to_bytes(pow(bytes_to_long(padded_message), e, n)) authorization = base64.b64encode(encrypted_message).decode() - links_data = self._download_json( - urljoin(self._BASE_URL, links_url), video_id, - 'Downloading links JSON metadata', headers={ - 'Authorization': 'Bearer ' + authorization, - }) - links = links_data.get('links') or {} - metas = metas or links_data.get('meta') or {} - sub_path = sub_path or links_data.get('subtitles') or \ - 'index.php?option=com_vodapi&task=subtitles.getJSON&format=json&id=' + video_id - sub_path += '&token=' + token - error = links_data.get('error') - title = metas.get('title') or video_info['title'] + + try: + links_data = self._download_json( + links_url, video_id, 'Downloading links JSON metadata', headers={ + 'X-Player-Token': authorization + }, query={ + 'freeWithAds': 'true', + 'adaptive': 'false', + 'withMetadata': 'true', + 'source': 'Web' + }) + break + except ExtractorError as e: + if not isinstance(e.cause, compat_HTTPError): + raise e + + if e.cause.code == 401: + # This usually goes away with a different random pkcs1pad, so retry + continue + + error = self._parse_json(e.cause.read(), video_id) + message = error.get('message') + if e.cause.code == 403 and error.get('code') == 
'player-bad-geolocation-country': + self.raise_geo_restricted(msg=message) + else: + raise ExtractorError(message) + else: + raise ExtractorError('Giving up retrying') + + links = links_data.get('links') or {} + metas = links_data.get('metadata') or {} + sub_url = (links.get('subtitles') or {}).get('all') + video_info = links_data.get('video') or {} + title = metas['title'] formats = [] - for format_id, qualities in links.items(): + for format_id, qualities in (links.get('streaming') or {}).items(): if not isinstance(qualities, dict): continue for quality, load_balancer_url in qualities.items(): @@ -189,19 +218,26 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' for f in m3u8_formats: f['language'] = 'fr' formats.extend(m3u8_formats) - if not error: - error = options.get('error') - if not formats and error: - raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) self._sort_formats(formats) + video = (self._download_json( + self._API_BASE_URL + 'video/%s' % video_id, video_id, + 'Downloading additional video metadata', fatal=False) or {}).get('video') or {} + show = video.get('show') or {} + return { 'id': video_id, 'title': title, - 'description': strip_or_none(metas.get('summary') or video_info.get('resume')), - 'thumbnail': video_info.get('image'), + 'description': strip_or_none(metas.get('summary') or video.get('summary')), + 'thumbnail': video_info.get('image') or player.get('image'), 'formats': formats, - 'subtitles': self.extract_subtitles(sub_path, video_id), - 'episode': metas.get('subtitle') or video_info.get('videoTitle'), - 'series': video_info.get('playlistTitle'), + 'subtitles': self.extract_subtitles(sub_url, video_id), + 'episode': metas.get('subtitle') or video.get('name'), + 'episode_number': int_or_none(video.get('shortNumber')), + 'series': show.get('title'), + 'season_number': int_or_none(video.get('season')), + 'duration': int_or_none(video_info.get('duration') or video.get('duration')), + 
'release_date': unified_strdate(video.get('releaseDate')), + 'average_rating': float_or_none(video.get('rating') or metas.get('rating')), + 'comment_count': int_or_none(video.get('commentsCount')), } diff --git a/youtube_dlc/extractor/animeondemand.py b/youtube_dlc/extractor/animeondemand.py index 00ce684d1c..54e097d2f7 100644 --- a/youtube_dlc/extractor/animeondemand.py +++ b/youtube_dlc/extractor/animeondemand.py @@ -116,8 +116,6 @@ class AnimeOnDemandIE(InfoExtractor): r'(?s)]+itemprop="description"[^>]*>(.+?)', webpage, 'anime description', default=None) - entries = [] - def extract_info(html, video_id, num=None): title, description = [None] * 2 formats = [] @@ -233,7 +231,7 @@ class AnimeOnDemandIE(InfoExtractor): self._sort_formats(info['formats']) f = common_info.copy() f.update(info) - entries.append(f) + yield f # Extract teaser/trailer only when full episode is not available if not info['formats']: @@ -247,7 +245,7 @@ class AnimeOnDemandIE(InfoExtractor): 'title': m.group('title'), 'url': urljoin(url, m.group('href')), }) - entries.append(f) + yield f def extract_episodes(html): for num, episode_html in enumerate(re.findall( @@ -275,7 +273,8 @@ class AnimeOnDemandIE(InfoExtractor): 'episode_number': episode_number, } - extract_entries(episode_html, video_id, common_info) + for e in extract_entries(episode_html, video_id, common_info): + yield e def extract_film(html, video_id): common_info = { @@ -283,11 +282,18 @@ class AnimeOnDemandIE(InfoExtractor): 'title': anime_title, 'description': anime_description, } - extract_entries(html, video_id, common_info) + for e in extract_entries(html, video_id, common_info): + yield e - extract_episodes(webpage) + def entries(): + has_episodes = False + for e in extract_episodes(webpage): + has_episodes = True + yield e - if not entries: - extract_film(webpage, anime_id) + if not has_episodes: + for e in extract_film(webpage, anime_id): + yield e - return self.playlist_result(entries, anime_id, anime_title, 
anime_description) + return self.playlist_result( + entries(), anime_id, anime_title, anime_description) diff --git a/youtube_dlc/extractor/cspan.py b/youtube_dlc/extractor/cspan.py index 766942146f..2e01aff488 100644 --- a/youtube_dlc/extractor/cspan.py +++ b/youtube_dlc/extractor/cspan.py @@ -8,11 +8,14 @@ from ..utils import ( ExtractorError, extract_attributes, find_xpath_attr, + get_element_by_attribute, get_element_by_class, int_or_none, js_to_json, merge_dicts, + parse_iso8601, smuggle_url, + str_to_int, unescapeHTML, ) from .senateisvp import SenateISVPIE @@ -116,8 +119,30 @@ class CSpanIE(InfoExtractor): jwsetup, video_id, require_title=False, m3u8_id='hls', base_url=url) add_referer(info['formats']) + for subtitles in info['subtitles'].values(): + for subtitle in subtitles: + ext = determine_ext(subtitle['url']) + if ext == 'php': + ext = 'vtt' + subtitle['ext'] = ext ld_info = self._search_json_ld(webpage, video_id, default={}) - return merge_dicts(info, ld_info) + title = get_element_by_class('video-page-title', webpage) or \ + self._og_search_title(webpage) + description = get_element_by_attribute('itemprop', 'description', webpage) or \ + self._html_search_meta(['og:description', 'description'], webpage) + return merge_dicts(info, ld_info, { + 'title': title, + 'thumbnail': get_element_by_attribute('itemprop', 'thumbnailUrl', webpage), + 'description': description, + 'timestamp': parse_iso8601(get_element_by_attribute('itemprop', 'uploadDate', webpage)), + 'location': get_element_by_attribute('itemprop', 'contentLocation', webpage), + 'duration': int_or_none(self._search_regex( + r'jwsetup\.seclength\s*=\s*(\d+);', + webpage, 'duration', fatal=False)), + 'view_count': str_to_int(self._search_regex( + r"]+class='views'[^>]*>([\d,]+)\s+Views", + webpage, 'views', fatal=False)), + }) # Obsolete # We first look for clipid, because clipprog always appears before diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index 
08d19017fb..8b322466bc 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -551,7 +551,10 @@ from .karaoketv import KaraoketvIE from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE from .ketnet import KetnetIE -from .khanacademy import KhanAcademyIE +from .khanacademy import ( + KhanAcademyIE, + KhanAcademyUnitIE, +) from .kickstarter import KickStarterIE from .kinja import KinjaEmbedIE from .kinopoisk import KinoPoiskIE diff --git a/youtube_dlc/extractor/khanacademy.py b/youtube_dlc/extractor/khanacademy.py index 61739efa7a..87e520378b 100644 --- a/youtube_dlc/extractor/khanacademy.py +++ b/youtube_dlc/extractor/khanacademy.py @@ -1,82 +1,107 @@ from __future__ import unicode_literals -import re +import json from .common import InfoExtractor from ..utils import ( - unified_strdate, + int_or_none, + parse_iso8601, + try_get, ) -class KhanAcademyIE(InfoExtractor): - _VALID_URL = r'^https?://(?:(?:www|api)\.)?khanacademy\.org/(?P<key>[^/]+)/(?:[^/]+/){,2}(?P<id>[^?#/]+)(?:$|[?#])' - IE_NAME = 'KhanAcademy' +class KhanAcademyBaseIE(InfoExtractor): + _VALID_URL_TEMPL = r'https?://(?:www\.)?khanacademy\.org/(?P<id>(?:[^/]+/){%s}%s[^?#/&]+)' - _TESTS = [{ - 'url': 'http://www.khanacademy.org/video/one-time-pad', - 'md5': '7b391cce85e758fb94f763ddc1bbb979', + def _parse_video(self, video): + return { + '_type': 'url_transparent', + 'url': video['youtubeId'], + 'id': video.get('slug'), + 'title': video.get('title'), + 'thumbnail': video.get('imageUrl') or video.get('thumbnailUrl'), + 'duration': int_or_none(video.get('duration')), + 'description': video.get('description'), + 'ie_key': 'Youtube', + } + + def _real_extract(self, url): + display_id = self._match_id(url) + component_props = self._parse_json(self._download_json( + 'https://www.khanacademy.org/api/internal/graphql', + display_id, query={ + 'hash': 1604303425, + 'variables': json.dumps({ + 'path': display_id, + 'queryParams': '', + }), + 
})['data']['contentJson'], display_id)['componentProps'] + return self._parse_component_props(component_props) + + +class KhanAcademyIE(KhanAcademyBaseIE): + IE_NAME = 'khanacademy' + _VALID_URL = KhanAcademyBaseIE._VALID_URL_TEMPL % ('4', 'v/') + _TEST = { + 'url': 'https://www.khanacademy.org/computing/computer-science/cryptography/crypt/v/one-time-pad', + 'md5': '9c84b7b06f9ebb80d22a5c8dedefb9a0', 'info_dict': { - 'id': 'one-time-pad', - 'ext': 'webm', + 'id': 'FlIG3TvQCBQ', + 'ext': 'mp4', 'title': 'The one-time pad', 'description': 'The perfect cipher', 'duration': 176, 'uploader': 'Brit Cruise', 'uploader_id': 'khanacademy', 'upload_date': '20120411', + 'timestamp': 1334170113, + 'license': 'cc-by-nc-sa', }, 'add_ie': ['Youtube'], - }, { - 'url': 'https://www.khanacademy.org/math/applied-math/cryptography', + } + + def _parse_component_props(self, component_props): + video = component_props['tutorialPageData']['contentModel'] + info = self._parse_video(video) + author_names = video.get('authorNames') + info.update({ + 'uploader': ', '.join(author_names) if author_names else None, + 'timestamp': parse_iso8601(video.get('dateAdded')), + 'license': video.get('kaUserLicense'), + }) + return info + + +class KhanAcademyUnitIE(KhanAcademyBaseIE): + IE_NAME = 'khanacademy:unit' + _VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('2', '')) + '/?(?:[?#&]|$)' + _TEST = { + 'url': 'https://www.khanacademy.org/computing/computer-science/cryptography', 'info_dict': { 'id': 'cryptography', - 'title': 'Journey into cryptography', + 'title': 'Cryptography', 'description': 'How have humans protected their secret messages through history? 
What has changed today?', }, - 'playlist_mincount': 3, - }] + 'playlist_mincount': 31, + } - def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('id') + def _parse_component_props(self, component_props): + curation = component_props['curation'] - if m.group('key') == 'video': - data = self._download_json( - 'http://api.khanacademy.org/api/v1/videos/' + video_id, - video_id, 'Downloading video info') - - upload_date = unified_strdate(data['date_added']) - uploader = ', '.join(data['author_names']) - return { - '_type': 'url_transparent', - 'url': data['url'], - 'id': video_id, - 'title': data['title'], - 'thumbnail': data['image_url'], - 'duration': data['duration'], - 'description': data['description'], - 'uploader': uploader, - 'upload_date': upload_date, + entries = [] + tutorials = try_get(curation, lambda x: x['tabs'][0]['modules'][0]['tutorials'], list) or [] + for tutorial_number, tutorial in enumerate(tutorials, 1): + chapter_info = { + 'chapter': tutorial.get('title'), + 'chapter_number': tutorial_number, + 'chapter_id': tutorial.get('id'), } - else: - # topic - data = self._download_json( - 'http://api.khanacademy.org/api/v1/topic/' + video_id, - video_id, 'Downloading topic info') + for content_item in (tutorial.get('contentItems') or []): + if content_item.get('kind') == 'Video': + info = self._parse_video(content_item) + info.update(chapter_info) + entries.append(info) - entries = [ - { - '_type': 'url', - 'url': c['url'], - 'id': c['id'], - 'title': c['title'], - } - for c in data['children'] if c['kind'] in ('Video', 'Topic')] - - return { - '_type': 'playlist', - 'id': video_id, - 'title': data['title'], - 'description': data['description'], - 'entries': entries, - } + return self.playlist_result( + entries, curation.get('unit'), curation.get('title'), + curation.get('description')) diff --git a/youtube_dlc/extractor/mixcloud.py b/youtube_dlc/extractor/mixcloud.py index 9759560f1b..69319857df 100644 --- 
a/youtube_dlc/extractor/mixcloud.py +++ b/youtube_dlc/extractor/mixcloud.py @@ -251,8 +251,11 @@ class MixcloudPlaylistBaseIE(MixcloudBaseIE): cloudcast_url = cloudcast.get('url') if not cloudcast_url: continue + slug = try_get(cloudcast, lambda x: x['slug'], compat_str) + owner_username = try_get(cloudcast, lambda x: x['owner']['username'], compat_str) + video_id = '%s_%s' % (owner_username, slug) if slug and owner_username else None entries.append(self.url_result( - cloudcast_url, MixcloudIE.ie_key(), cloudcast.get('slug'))) + cloudcast_url, MixcloudIE.ie_key(), video_id)) page_info = items['pageInfo'] has_next_page = page_info['hasNextPage'] @@ -321,7 +324,8 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): _DESCRIPTION_KEY = 'biog' _ROOT_TYPE = 'user' _NODE_TEMPLATE = '''slug - url''' + url + owner { username }''' def _get_playlist_title(self, title, slug): return '%s (%s)' % (title, slug) @@ -345,6 +349,7 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE): _NODE_TEMPLATE = '''cloudcast { slug url + owner { username } }''' def _get_cloudcast(self, node): diff --git a/youtube_dlc/extractor/peertube.py b/youtube_dlc/extractor/peertube.py index c39d12728d..c2ca71c71d 100644 --- a/youtube_dlc/extractor/peertube.py +++ b/youtube_dlc/extractor/peertube.py @@ -450,6 +450,18 @@ class PeerTubeIE(InfoExtractor): 'tags': ['framasoft', 'peertube'], 'categories': ['Science & Technology'], } + }, { + # Issue #26002 + 'url': 'peertube:spacepub.space:d8943b2d-8280-497b-85ec-bc282ec2afdc', + 'info_dict': { + 'id': 'd8943b2d-8280-497b-85ec-bc282ec2afdc', + 'ext': 'mp4', + 'title': 'Dot matrix printer shell demo', + 'uploader_id': '3', + 'timestamp': 1587401293, + 'upload_date': '20200420', + 'uploader': 'Drew DeVault', + } }, { 'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44', 'only_matching': True, @@ -526,7 +538,15 @@ class PeerTubeIE(InfoExtractor): title = video['name'] formats = [] - for file_ in video['files']: + files = 
video.get('files') or [] + for playlist in (video.get('streamingPlaylists') or []): + if not isinstance(playlist, dict): + continue + playlist_files = playlist.get('files') + if not (playlist_files and isinstance(playlist_files, list)): + continue + files.extend(playlist_files) + for file_ in files: if not isinstance(file_, dict): continue file_url = url_or_none(file_.get('fileUrl')) diff --git a/youtube_dlc/extractor/spike.py b/youtube_dlc/extractor/spike.py index 3cee331f6a..4180e71efa 100644 --- a/youtube_dlc/extractor/spike.py +++ b/youtube_dlc/extractor/spike.py @@ -50,9 +50,15 @@ class ParamountNetworkIE(MTVServicesInfoExtractor): }, }] - _FEED_URL = 'http://www.paramountnetwork.com/feeds/mrss/' + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' _GEO_COUNTRIES = ['US'] + def _get_feed_query(self, uri): + return { + 'arcEp': 'paramountnetwork.com', + 'mgid': uri, + } + def _extract_mgid(self, webpage, url): root_data = self._parse_json(self._search_regex( r'window\.__DATA__\s*=\s*({.+})', diff --git a/youtube_dlc/extractor/threeqsdn.py b/youtube_dlc/extractor/threeqsdn.py index f26937da1e..f6d37bb9e0 100644 --- a/youtube_dlc/extractor/threeqsdn.py +++ b/youtube_dlc/extractor/threeqsdn.py @@ -3,10 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_HTTPError from ..utils import ( determine_ext, - js_to_json, - mimetype2ext, + ExtractorError, + float_or_none, + int_or_none, + parse_iso8601, ) @@ -15,29 +18,35 @@ class ThreeQSDNIE(InfoExtractor): IE_DESC = '3Q SDN' _VALID_URL = r'https?://playout\.3qsdn\.com/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' _TESTS = [{ - # ondemand from http://www.philharmonie.tv/veranstaltung/26/ - 'url': 'http://playout.3qsdn.com/0280d6b9-1215-11e6-b427-0cc47a188158?protocol=http', - 'md5': 'ab040e37bcfa2e0c079f92cb1dd7f6cd', + # https://player.3qsdn.com/demo.html + 'url': 
'https://playout.3qsdn.com/7201c779-6b3c-11e7-a40e-002590c750be', + 'md5': '64a57396b16fa011b15e0ea60edce918', 'info_dict': { - 'id': '0280d6b9-1215-11e6-b427-0cc47a188158', + 'id': '7201c779-6b3c-11e7-a40e-002590c750be', 'ext': 'mp4', - 'title': '0280d6b9-1215-11e6-b427-0cc47a188158', + 'title': 'Video Ads', 'is_live': False, + 'description': 'Video Ads Demo', + 'timestamp': 1500334803, + 'upload_date': '20170717', + 'duration': 888.032, + 'subtitles': { + 'eng': 'count:1', + }, }, - 'expected_warnings': ['Failed to download MPD manifest', 'Failed to parse JSON'], + 'expected_warnings': ['Unknown MIME type application/mp4 in DASH manifest'], }, { # live video stream - 'url': 'https://playout.3qsdn.com/d755d94b-4ab9-11e3-9162-0025907ad44f?js=true', + 'url': 'https://playout.3qsdn.com/66e68995-11ca-11e8-9273-002590c750be', 'info_dict': { - 'id': 'd755d94b-4ab9-11e3-9162-0025907ad44f', + 'id': '66e68995-11ca-11e8-9273-002590c750be', 'ext': 'mp4', - 'title': 're:^d755d94b-4ab9-11e3-9162-0025907ad44f [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^66e68995-11ca-11e8-9273-002590c750be [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, }, 'params': { 'skip_download': True, # m3u8 downloads }, - 'expected_warnings': ['Failed to download MPD manifest'], }, { # live audio stream 'url': 'http://playout.3qsdn.com/9edf36e0-6bf2-11e2-a16a-9acf09e2db48', @@ -58,6 +67,14 @@ class ThreeQSDNIE(InfoExtractor): # live video with rtmp link 'url': 'https://playout.3qsdn.com/6092bb9e-8f72-11e4-a173-002590c750be', 'only_matching': True, + }, { + # ondemand from http://www.philharmonie.tv/veranstaltung/26/ + 'url': 'http://playout.3qsdn.com/0280d6b9-1215-11e6-b427-0cc47a188158?protocol=http', + 'only_matching': True, + }, { + # live video stream + 'url': 'https://playout.3qsdn.com/d755d94b-4ab9-11e3-9162-0025907ad44f?js=true', + 'only_matching': True, }] @staticmethod @@ -70,73 +87,78 @@ class ThreeQSDNIE(InfoExtractor): def _real_extract(self, url): video_id 
= self._match_id(url) - js = self._download_webpage( - 'http://playout.3qsdn.com/%s' % video_id, video_id, - query={'js': 'true'}) + try: + config = self._download_json( + url.replace('://playout.3qsdn.com/', '://playout.3qsdn.com/config/'), video_id) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + self.raise_geo_restricted() + raise - if any(p in js for p in ( - '>This content is not available in your country', - 'playout.3qsdn.com/forbidden')): - self.raise_geo_restricted() - - stream_content = self._search_regex( - r'streamContent\s*:\s*(["\'])(?P.+?)\1', js, - 'stream content', default='demand', group='content') - - live = stream_content == 'live' - - stream_type = self._search_regex( - r'streamType\s*:\s*(["\'])(?Paudio|video)\1', js, - 'stream type', default='video', group='type') + live = config.get('streamContent') == 'live' + aspect = float_or_none(config.get('aspect')) formats = [] - urls = set() - - def extract_formats(item_url, item={}): - if not item_url or item_url in urls: - return - urls.add(item_url) - ext = mimetype2ext(item.get('type')) or determine_ext(item_url, default_ext=None) - if ext == 'mpd': - formats.extend(self._extract_mpd_formats( - item_url, video_id, mpd_id='mpd', fatal=False)) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - item_url, video_id, 'mp4', - entry_protocol='m3u8' if live else 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - item_url, video_id, f4m_id='hds', fatal=False)) - else: - if not self._is_valid_url(item_url, video_id): - return - formats.append({ - 'url': item_url, - 'format_id': item.get('quality'), - 'ext': 'mp4' if item_url.startswith('rtsp') else ext, - 'vcodec': 'none' if stream_type == 'audio' else None, - }) - - for item_js in re.findall(r'({[^{]*?\b(?:src|source)\s*:\s*["\'].+?})', js): - f = self._parse_json( - item_js, video_id, transform_source=js_to_json, 
fatal=False) - if not f: + for source_type, source in (config.get('sources') or {}).items(): + if not source: continue - extract_formats(f.get('src'), f) + if source_type == 'dash': + formats.extend(self._extract_mpd_formats( + source, video_id, mpd_id='mpd', fatal=False)) + elif source_type == 'hls': + formats.extend(self._extract_m3u8_formats( + source, video_id, 'mp4', 'm3u8' if live else 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif source_type == 'progressive': + for s in source: + src = s.get('src') + if not (src and self._is_valid_url(src, video_id)): + continue + width = None + format_id = ['http'] + ext = determine_ext(src) + if ext: + format_id.append(ext) + height = int_or_none(s.get('height')) + if height: + format_id.append('%dp' % height) + if aspect: + width = int(height * aspect) + formats.append({ + 'ext': ext, + 'format_id': '-'.join(format_id), + 'height': height, + 'source_preference': 0, + 'url': src, + 'vcodec': 'none' if height == 0 else None, + 'width': width, + }) + for f in formats: + if f.get('acodec') == 'none': + f['preference'] = -40 + elif f.get('vcodec') == 'none': + f['preference'] = -50 + self._sort_formats(formats, ('preference', 'width', 'height', 'source_preference', 'tbr', 'vbr', 'abr', 'ext', 'format_id')) - # More relaxed version to collect additional URLs and acting - # as a future-proof fallback - for _, src in re.findall(r'\b(?:src|source)\s*:\s*(["\'])((?:https?|rtsp)://.+?)\1', js): - extract_formats(src) + subtitles = {} + for subtitle in (config.get('subtitles') or []): + src = subtitle.get('src') + if not src: + continue + subtitles.setdefault(subtitle.get('label') or 'eng', []).append({ + 'url': src, + }) - self._sort_formats(formats) - - title = self._live_title(video_id) if live else video_id + title = config.get('title') or video_id return { 'id': video_id, - 'title': title, + 'title': self._live_title(title) if live else title, + 'thumbnail': config.get('poster') or None, + 'description': 
config.get('description') or None, + 'timestamp': parse_iso8601(config.get('upload_date')), + 'duration': float_or_none(config.get('vlength')) or None, 'is_live': live, 'formats': formats, + 'subtitles': subtitles, } diff --git a/youtube_dlc/extractor/twitch.py b/youtube_dlc/extractor/twitch.py index 503d019de1..fc8cb73217 100644 --- a/youtube_dlc/extractor/twitch.py +++ b/youtube_dlc/extractor/twitch.py @@ -17,6 +17,7 @@ from ..compat import ( ) from ..utils import ( clean_html, + dict_get, ExtractorError, float_or_none, int_or_none, @@ -76,14 +77,14 @@ class TwitchBaseIE(InfoExtractor): headers = { 'Referer': page_url, - 'Origin': page_url, + 'Origin': 'https://www.twitch.tv', 'Content-Type': 'text/plain;charset=UTF-8', } response = self._download_json( post_url, None, note, data=json.dumps(form).encode(), headers=headers, expected_status=400) - error = response.get('error_description') or response.get('error_code') + error = dict_get(response, ('error', 'error_description', 'error_code')) if error: fail(error) @@ -137,13 +138,17 @@ class TwitchBaseIE(InfoExtractor): self._sort_formats(formats) def _download_base_gql(self, video_id, ops, note, fatal=True): + headers = { + 'Content-Type': 'text/plain;charset=UTF-8', + 'Client-ID': self._CLIENT_ID, + } + gql_auth = self._get_cookies('https://gql.twitch.tv').get('auth-token') + if gql_auth: + headers['Authorization'] = 'OAuth ' + gql_auth.value return self._download_json( 'https://gql.twitch.tv/gql', video_id, note, data=json.dumps(ops).encode(), - headers={ - 'Content-Type': 'text/plain;charset=UTF-8', - 'Client-ID': self._CLIENT_ID, - }, fatal=fatal) + headers=headers, fatal=fatal) def _download_gql(self, video_id, ops, note, fatal=True): for op in ops: diff --git a/youtube_dlc/extractor/twitter.py b/youtube_dlc/extractor/twitter.py index 4602c09841..8a2a77b710 100644 --- a/youtube_dlc/extractor/twitter.py +++ b/youtube_dlc/extractor/twitter.py @@ -373,6 +373,24 @@ class TwitterIE(TwitterBaseIE): 'uploader_id': 
'1eVjYOLGkGrQL', }, 'add_ie': ['TwitterBroadcast'], + }, { + # unified card + 'url': 'https://twitter.com/BrooklynNets/status/1349794411333394432?s=20', + 'info_dict': { + 'id': '1349794411333394432', + 'ext': 'mp4', + 'title': 'md5:d1c4941658e4caaa6cb579260d85dcba', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:71ead15ec44cee55071547d6447c6a3e', + 'uploader': 'Brooklyn Nets', + 'uploader_id': 'BrooklynNets', + 'duration': 324.484, + 'timestamp': 1610651040, + 'upload_date': '20210114', + }, + 'params': { + 'skip_download': True, + }, }, { # Twitch Clip Embed 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', @@ -389,6 +407,22 @@ class TwitterIE(TwitterBaseIE): # appplayer card 'url': 'https://twitter.com/poco_dandy/status/1150646424461176832', 'only_matching': True, + }, { + # video_direct_message card + 'url': 'https://twitter.com/qarev001/status/1348948114569269251', + 'only_matching': True, + }, { + # poll2choice_video card + 'url': 'https://twitter.com/CAF_Online/status/1349365911120195585', + 'only_matching': True, + }, { + # poll3choice_video card + 'url': 'https://twitter.com/SamsungMobileSA/status/1348609186725289984', + 'only_matching': True, + }, { + # poll4choice_video card + 'url': 'https://twitter.com/SouthamptonFC/status/1347577658079641604', + 'only_matching': True, }] def _real_extract(self, url): @@ -433,8 +467,7 @@ class TwitterIE(TwitterBaseIE): 'tags': tags, } - media = try_get(status, lambda x: x['extended_entities']['media'][0]) - if media and media.get('type') != 'photo': + def extract_from_video_info(media): video_info = media.get('video_info') or {} formats = [] @@ -461,6 +494,10 @@ class TwitterIE(TwitterBaseIE): 'thumbnails': thumbnails, 'duration': float_or_none(video_info.get('duration_millis'), 1000), }) + + media = try_get(status, lambda x: x['extended_entities']['media'][0]) + if media and media.get('type') != 'photo': + extract_from_video_info(media) else: card = status.get('card') if card: @@ -493,7 
+530,12 @@ class TwitterIE(TwitterBaseIE): '_type': 'url', 'url': get_binding_value('card_url'), }) - # amplify, promo_video_website, promo_video_convo, appplayer, ... + elif card_name == 'unified_card': + media_entities = self._parse_json(get_binding_value('unified_card'), twid)['media_entities'] + extract_from_video_info(next(iter(media_entities.values()))) + # amplify, promo_video_website, promo_video_convo, appplayer, + # video_direct_message, poll2choice_video, poll3choice_video, + # poll4choice_video, ... else: is_amplify = card_name == 'amplify' vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url') diff --git a/youtube_dlc/extractor/youporn.py b/youtube_dlc/extractor/youporn.py index 7b9feafeb2..534270bac3 100644 --- a/youtube_dlc/extractor/youporn.py +++ b/youtube_dlc/extractor/youporn.py @@ -60,6 +60,9 @@ class YouPornIE(InfoExtractor): }, { 'url': 'http://www.youporn.com/watch/505835', 'only_matching': True, + }, { + 'url': 'https://www.youporn.com/watch/13922959/femdom-principal/', + 'only_matching': True, }] @staticmethod @@ -88,7 +91,7 @@ class YouPornIE(InfoExtractor): # Main source definitions = self._parse_json( self._search_regex( - r'mediaDefinition\s*=\s*(\[.+?\]);', webpage, + r'mediaDefinition\s*[=:]\s*(\[.+?\])\s*[;,]', webpage, 'media definitions', default='[]'), video_id, fatal=False) if definitions: @@ -100,7 +103,7 @@ class YouPornIE(InfoExtractor): links.append(video_url) # Fallback #1, this also contains extra low quality 180p format - for _, link in re.findall(r']+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage): + for _, link in re.findall(r']+href=(["\'])(http(?:(?!\1).)+\.mp4(?:(?!\1).)*)\1[^>]+title=["\']Download [Vv]ideo', webpage): links.append(link) # Fallback #2 (unavailable as at 22.06.2017) @@ -128,8 +131,9 @@ class YouPornIE(InfoExtractor): # Video URL's path looks like this: # 
/201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 # /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 + # /videos/201703/11/109285532/1080P_4000K_109285532.mp4 # We will benefit from it by extracting some metadata - mobj = re.search(r'(?P\d{3,4})[pP]_(?P\d+)[kK]_\d+/', video_url) + mobj = re.search(r'(?P\d{3,4})[pP]_(?P\d+)[kK]_\d+', video_url) if mobj: height = int(mobj.group('height')) bitrate = int(mobj.group('bitrate')) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 0b87f2185e..20657bb196 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -332,6 +332,36 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg', default='{}'), video_id, fatal=False) + def _extract_video(self, renderer): + video_id = renderer.get('videoId') + title = try_get( + renderer, + (lambda x: x['title']['runs'][0]['text'], + lambda x: x['title']['simpleText']), compat_str) + description = try_get( + renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'], + compat_str) + duration = parse_duration(try_get( + renderer, lambda x: x['lengthText']['simpleText'], compat_str)) + view_count_text = try_get( + renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or '' + view_count = str_to_int(self._search_regex( + r'^([\d,]+)', re.sub(r'\s', '', view_count_text), + 'view count', default=None)) + uploader = try_get( + renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str) + return { + '_type': 'url_transparent', + 'ie_key': YoutubeIE.ie_key(), + 'id': video_id, + 'url': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'view_count': view_count, + 'uploader': uploader, + } + class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com' @@ -2871,36 +2901,6 @@ class 
YoutubeTabIE(YoutubeBaseInfoExtractor): if renderer: return renderer - def _extract_video(self, renderer): - video_id = renderer.get('videoId') - title = try_get( - renderer, - (lambda x: x['title']['runs'][0]['text'], - lambda x: x['title']['simpleText']), compat_str) - description = try_get( - renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'], - compat_str) - duration = parse_duration(try_get( - renderer, lambda x: x['lengthText']['simpleText'], compat_str)) - view_count_text = try_get( - renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or '' - view_count = str_to_int(self._search_regex( - r'^([\d,]+)', re.sub(r'\s', '', view_count_text), - 'view count', default=None)) - uploader = try_get( - renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str) - return { - '_type': 'url_transparent', - 'ie_key': YoutubeIE.ie_key(), - 'id': video_id, - 'url': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'view_count': view_count, - 'uploader': uploader, - } - def _grid_entries(self, grid_renderer): for item in grid_renderer['items']: if not isinstance(item, dict): @@ -3583,65 +3583,38 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor): if not slr_contents: break - isr_contents = [] - continuation_token = None # Youtube sometimes adds promoted content to searches, # changing the index location of videos and token. # So we search through all entries till we find them. 
- for index, isr in enumerate(slr_contents): + continuation_token = None + for slr_content in slr_contents: + isr_contents = try_get( + slr_content, + lambda x: x['itemSectionRenderer']['contents'], + list) if not isr_contents: - isr_contents = try_get( - slr_contents, - (lambda x: x[index]['itemSectionRenderer']['contents']), - list) - for content in isr_contents: - if content.get('videoRenderer') is not None: - break - else: - isr_contents = [] + continue + for content in isr_contents: + if not isinstance(content, dict): + continue + video = content.get('videoRenderer') + if not isinstance(video, dict): + continue + video_id = video.get('videoId') + if not video_id: + continue + + yield self._extract_video(video) + total += 1 + if total == n: + return if continuation_token is None: continuation_token = try_get( - slr_contents, - lambda x: x[index]['continuationItemRenderer']['continuationEndpoint']['continuationCommand'][ - 'token'], + slr_content, + lambda x: x['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'], compat_str) - if continuation_token is not None and isr_contents: - break - if not isr_contents: - break - for content in isr_contents: - if not isinstance(content, dict): - continue - video = content.get('videoRenderer') - if not isinstance(video, dict): - continue - video_id = video.get('videoId') - if not video_id: - continue - title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str) - description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str) - duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str)) - view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or '' - view_count = str_to_int(self._search_regex( - r'^([\d,]+)', re.sub(r'\s', '', view_count_text), - 'view count', default=None)) - uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str) - total += 1 - yield 
{ - '_type': 'url_transparent', - 'ie_key': YoutubeIE.ie_key(), - 'id': video_id, - 'url': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'view_count': view_count, - 'uploader': uploader, - } - if total == n: - return if not continuation_token: break data['continuation'] = continuation_token