From c15de6ffe6a36a31ea59afe11df5a77c2544d414 Mon Sep 17 00:00:00 2001
From: Lauren Liberda
Date: Sun, 31 Oct 2021 11:01:47 +0530
Subject: [PATCH] [tvp] Fix extractor (#1401)

Authored by: selfisekai
---
 yt_dlp/extractor/tvp.py | 211 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 189 insertions(+), 22 deletions(-)

diff --git a/yt_dlp/extractor/tvp.py b/yt_dlp/extractor/tvp.py
index 87d455e6db..22cfbd25e0 100644
--- a/yt_dlp/extractor/tvp.py
+++ b/yt_dlp/extractor/tvp.py
@@ -2,35 +2,40 @@
 from __future__ import unicode_literals
 
 import itertools
+import random
 import re
 
 from .common import InfoExtractor
 from ..utils import (
-    clean_html,
     determine_ext,
+    dict_get,
     ExtractorError,
-    get_element_by_attribute,
+    int_or_none,
+    js_to_json,
     orderedSet,
+    str_or_none,
+    try_get,
 )
 
 
 class TVPIE(InfoExtractor):
     IE_NAME = 'tvp'
     IE_DESC = 'Telewizja Polska'
-    _VALID_URL = r'https?://[^/]+\.tvp\.(?:pl|info)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:[^/]+\.)?(?:tvp(?:parlament)?\.(?:pl|info)|polandin\.com)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)'
 
     _TESTS = [{
+        # TVPlayer 2 in js wrapper
         'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536',
-        'md5': 'a21eb0aa862f25414430f15fdfb9e76c',
         'info_dict': {
             'id': '194536',
             'ext': 'mp4',
             'title': 'Czas honoru, odc. 13 – Władek',
             'description': 'md5:437f48b93558370b031740546b696e24',
+            'age_limit': 12,
         },
     }, {
+        # TVPlayer legacy
         'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176',
-        'md5': 'b0005b542e5b4de643a9690326ab1257',
         'info_dict': {
             'id': '17916176',
             'ext': 'mp4',
@@ -38,16 +43,63 @@ class TVPIE(InfoExtractor):
             'description': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata',
         },
     }, {
-        # page id is not the same as video id(#7799)
-        'url': 'https://wiadomosci.tvp.pl/33908820/28092017-1930',
-        'md5': '84cd3c8aec4840046e5ab712416b73d0',
+        # TVPlayer 2 in iframe
+        'url': 'https://wiadomosci.tvp.pl/50725617/dzieci-na-sprzedaz-dla-homoseksualistow',
         'info_dict': {
-            'id': '33908820',
+            'id': '50725617',
             'ext': 'mp4',
-            'title': 'Wiadomości, 28.09.2017, 19:30',
-            'description': 'Wydanie główne codziennego serwisu informacyjnego.'
+            'title': 'Dzieci na sprzedaż dla homoseksualistów',
+            'description': 'md5:7d318eef04e55ddd9f87a8488ac7d590',
+            'age_limit': 12,
         },
-        'skip': 'HTTP Error 404: Not Found',
+    }, {
+        # TVPlayer 2 in client-side rendered website (regional; window.__newsData)
+        'url': 'https://warszawa.tvp.pl/25804446/studio-yayo',
+        'info_dict': {
+            'id': '25804446',
+            'ext': 'mp4',
+            'title': 'Studio Yayo',
+            'upload_date': '20160616',
+            'timestamp': 1466075700,
+        }
+    }, {
+        # TVPlayer 2 in client-side rendered website (tvp.info; window.__videoData)
+        'url': 'https://www.tvp.info/52880236/09042021-0800',
+        'info_dict': {
+            'id': '52880236',
+            'ext': 'mp4',
+            'title': '09.04.2021, 08:00',
+        },
+    }, {
+        # client-side rendered (regional) program (playlist) page
+        'url': 'https://opole.tvp.pl/9660819/rozmowa-dnia',
+        'info_dict': {
+            'id': '9660819',
+            'description': 'Od poniedziałku do piątku o 18:55',
+            'title': 'Rozmowa dnia',
+        },
+        'playlist_mincount': 1800,
+        'params': {
+            'skip_download': True,
+        }
+    }, {
+        # ABC-specific video embedding
+        # moved to https://bajkowakraina.tvp.pl/wideo/50981130,teleranek,51027049,zubr,51116450
+        'url': 'https://abc.tvp.pl/48636269/zubry-odc-124',
+        'info_dict': {
+            'id': '48320456',
+            'ext': 'mp4',
+            'title': 'Teleranek, Żubr',
+        },
+        'skip': 'unavailable',
+    }, {
+        # yet another vue page
+        'url': 'https://jp2.tvp.pl/46925618/filmy',
+        'info_dict': {
+            'id': '46925618',
+            'title': 'Filmy',
+        },
+        'playlist_mincount': 19,
     }, {
         'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272',
         'only_matching': True,
@@ -66,21 +118,134 @@ class TVPIE(InfoExtractor):
     }, {
         'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji',
         'only_matching': True,
+    }, {
+        'url': 'https://tvp.info/49193823/teczowe-flagi-na-pomnikach-prokuratura-wszczela-postepowanie-wieszwiecej',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.tvpparlament.pl/retransmisje-vod/inne/wizyta-premiera-mateusza-morawieckiego-w-firmie-berotu-sp-z-oo/48857277',
+        'only_matching': True,
+    }, {
+        'url': 'https://polandin.com/47942651/pln-10-billion-in-subsidies-transferred-to-companies-pm',
+        'only_matching': True,
     }]
 
+    def _parse_vue_website_data(self, webpage, page_id):
+        website_data = self._search_regex([
+            # website - regiony, tvp.info
+            # directory - jp2.tvp.pl
+            r'window\.__(?:website|directory)Data\s*=\s*({(?:.|\s)+?});',
+        ], webpage, 'website data', default=None)  # non-fatal, so the None return below is reachable
+        if not website_data:
+            return None
+        return self._parse_json(website_data, page_id, transform_source=js_to_json)
+
+    def _extract_vue_video(self, video_data, page_id=None):
+        if isinstance(video_data, str):
+            video_data = self._parse_json(video_data, page_id, transform_source=js_to_json)
+        thumbnails = []
+        image = video_data.get('image')
+        if image:
+            for thumb in (image if isinstance(image, list) else [image]):
+                thmb_url = str_or_none(thumb.get('url'))
+                if thmb_url:
+                    thumbnails.append({
+                        'url': thmb_url,
+                    })
+        is_website = video_data.get('type') == 'website'
+        if is_website:
+            url = video_data['url']
+            vod_url_match = re.match(r'https?://vod\.tvp\.pl/(\d+)/([^/?#]+)', url)
+            if vod_url_match:
+                url = f'https://vod.tvp.pl/website/{vod_url_match.group(2)},{vod_url_match.group(1)}'
+        else:
+            url = 'tvp:' + str_or_none(video_data.get('_id') or page_id)
+        return {
+            '_type': 'url_transparent',
+            'id': str_or_none(video_data.get('_id') or page_id),
+            'url': url,
+            'ie_key': 'TVPEmbed' if not is_website else 'TVPWebsite',
+            'title': str_or_none(video_data.get('title')),
+            'description': str_or_none(video_data.get('lead')),
+            'timestamp': int_or_none(video_data.get('release_date_long')),
+            'duration': int_or_none(video_data.get('duration')),
+            'thumbnails': thumbnails,
+        }
+
+    def _handle_vuejs_page(self, url, webpage, page_id):
+        # vue client-side rendered sites (all regional pages + tvp.info)
+        video_data = self._search_regex([
+            r'window\.__(?:news|video)Data\s*=\s*({(?:.|\s)+?})\s*;',
+        ], webpage, 'video data', default=None)
+        if video_data:
+            return self._extract_vue_video(video_data, page_id=page_id)
+        # paged playlists
+        website_data = self._parse_vue_website_data(webpage, page_id)
+        if website_data:
+            entries = self._vuejs_entries(url, website_data, page_id)
+
+            return {
+                '_type': 'playlist',
+                'id': page_id,
+                'title': str_or_none(website_data.get('title')),
+                'description': str_or_none(website_data.get('lead')),
+                'entries': entries,
+            }
+        raise ExtractorError('Could not extract video/website data')
+
+    def _vuejs_entries(self, url, website_data, page_id):
+
+        def extract_videos(wd):
+            if wd.get('latestVideo'):
+                yield self._extract_vue_video(wd['latestVideo'])
+            for video in wd.get('videos') or []:
+                yield self._extract_vue_video(video)
+            for video in wd.get('items') or []:
+                yield self._extract_vue_video(video)
+
+        yield from extract_videos(website_data)
+
+        if (int_or_none(website_data.get('items_total_count')) or 0) > (int_or_none(website_data.get('items_per_page')) or 0):  # missing counts compare as 0 instead of raising TypeError
+            for page in itertools.count(2):
+                page_website_data = self._parse_vue_website_data(
+                    self._download_webpage(url, page_id, note='Downloading page #%d' % page,
+                                           query={'page': page}),
+                    page_id)
+                if not page_website_data or not (page_website_data.get('videos') or page_website_data.get('items')):
+                    break
+                yield from extract_videos(page_website_data)
+
     def _real_extract(self, url):
         page_id = self._match_id(url)
-        webpage = self._download_webpage(url, page_id)
+        webpage, urlh = self._download_webpage_handle(url, page_id)
+
+        # The URL may redirect to a VOD
+        # example: https://vod.tvp.pl/48463890/wadowickie-spotkania-z-janem-pawlem-ii
+        if TVPWebsiteIE.suitable(urlh.url):
+            return self.url_result(urlh.url, ie=TVPWebsiteIE.ie_key(), video_id=page_id)
+
+        if re.search(
+                r'window\.__(?:video|news|website|directory)Data\s*=',
+                webpage):
+            return self._handle_vuejs_page(url, webpage, page_id)
+
+        # classic server-side rendered sites
         video_id = self._search_regex([
+            r'<iframe[^>]+src="[^"]*?embed\.php\?(?:[^&]+&)*ID=(\d+)',
             r'<iframe[^>]+src="[^"]*?object_id=(\d+)',
             r"object_id\s*:\s*'(\d+)'",
-            r'data-video-id="(\d+)"'], webpage, 'video id', default=page_id)
+            r'data-video-id="(\d+)"',
+
+            # abc.tvp.pl - somehow there are more than one video IDs that seem to be the same video?
+            # the first one is referred to as "copyid", and seems to be unused by the website
+            r'<script>\s*tvpabc\.video\.init\(\s*\d+,\s*(\d+)\s*\)\s*</script>',
+        ], webpage, 'video id', default=page_id)
         return {
             '_type': 'url_transparent',
             'url': 'tvp:' + video_id,
             'description': self._og_search_description(
-                webpage, default=None) or self._html_search_meta(
-                'description', webpage, default=None),
+                webpage, default=None) or (self._html_search_meta(
+                    'description', webpage, default=None)
+                    if '//s.tvp.pl/files/portal/v' in webpage else None),
             'thumbnail': self._og_search_thumbnail(webpage, default=None),
             'ie_key': 'TVPEmbed',
         }
@@ -252,18 +417,20 @@ class TVPWebsiteIE(InfoExtractor):
 
     _TESTS = [{
         # series
-        'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312/video',
+        'url': 'https://vod.tvp.pl/website/wspaniale-stulecie,17069012/video',
         'info_dict': {
-            'id': '38678312',
+            'id': '17069012',
         },
-        'playlist_count': 115,
+        'playlist_count': 312,
     }, {
         # film
-        'url': 'https://vod.tvp.pl/website/gloria,35139666',
+        'url': 'https://vod.tvp.pl/website/krzysztof-krawczyk-cale-moje-zycie,51374466',
         'info_dict': {
-            'id': '36637049',
+            'id': '51374509',
             'ext': 'mp4',
-            'title': 'Gloria, Gloria',
+            'title': 'Krzysztof Krawczyk – całe moje życie, Krzysztof Krawczyk – całe moje życie',
+            'description': 'md5:2e80823f00f5fc263555482f76f8fa42',
+            'age_limit': 12,
         },
         'params': {
             'skip_download': True,