1
mirror of https://github.com/yt-dlp/yt-dlp synced 2024-12-14 20:59:16 +01:00

[BiliIntl] Fix extractor (#2077)

Closes #1744
Authored by: MinePlayersPE
This commit is contained in:
MinePlayersPE 2021-12-26 05:41:38 +07:00 committed by GitHub
parent 3774f4f427
commit c62ecf0d90
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -19,14 +19,15 @@ from ..utils import (
parse_iso8601, parse_iso8601,
traverse_obj, traverse_obj,
try_get, try_get,
parse_count,
smuggle_url, smuggle_url,
srt_subtitles_timecode, srt_subtitles_timecode,
str_or_none, str_or_none,
str_to_int,
strip_jsonp, strip_jsonp,
unified_timestamp, unified_timestamp,
unsmuggle_url, unsmuggle_url,
urlencode_postdata, urlencode_postdata,
url_or_none,
OnDemandPagedList OnDemandPagedList
) )
@@ -722,10 +723,10 @@ class BiliBiliPlayerIE(InfoExtractor):
class BiliIntlBaseIE(InfoExtractor): class BiliIntlBaseIE(InfoExtractor):
_API_URL = 'https://api.bili{}/intl/gateway{}' _API_URL = 'https://api.bilibili.tv/intl/gateway'
def _call_api(self, type, endpoint, id): def _call_api(self, endpoint, *args, **kwargs):
return self._download_json(self._API_URL.format(type, endpoint), id)['data'] return self._download_json(self._API_URL + endpoint, *args, **kwargs)['data']
def json2srt(self, json): def json2srt(self, json):
data = '\n\n'.join( data = '\n\n'.join(
@@ -733,29 +734,40 @@ class BiliIntlBaseIE(InfoExtractor):
for i, line in enumerate(json['body'])) for i, line in enumerate(json['body']))
return data return data
def _get_subtitles(self, type, ep_id): def _get_subtitles(self, ep_id):
sub_json = self._call_api(type, f'/m/subtitle?ep_id={ep_id}&platform=web', ep_id) sub_json = self._call_api(f'/web/v2/subtitle?episode_id={ep_id}&platform=web', ep_id)
subtitles = {} subtitles = {}
for sub in sub_json.get('subtitles', []): for sub in sub_json.get('subtitles') or []:
sub_url = sub.get('url') sub_url = sub.get('url')
if not sub_url: if not sub_url:
continue continue
sub_data = self._download_json(sub_url, ep_id, fatal=False) sub_data = self._download_json(
sub_url, ep_id, errnote='Unable to download subtitles', fatal=False,
note='Downloading subtitles%s' % f' for {sub["lang"]}' if sub.get('lang') else '')
if not sub_data: if not sub_data:
continue continue
subtitles.setdefault(sub.get('key', 'en'), []).append({ subtitles.setdefault(sub.get('lang_key', 'en'), []).append({
'ext': 'srt', 'ext': 'srt',
'data': self.json2srt(sub_data) 'data': self.json2srt(sub_data)
}) })
return subtitles return subtitles
def _get_formats(self, type, ep_id): def _get_formats(self, ep_id):
video_json = self._call_api(type, f'/web/playurl?ep_id={ep_id}&platform=web', ep_id) video_json = self._call_api(f'/web/playurl?ep_id={ep_id}&platform=web', ep_id,
if not video_json: note='Downloading video formats', errnote='Unable to download video formats')
if video_json.get('code'):
if video_json['code'] in (10004004, 10004005, 10023006):
self.raise_login_required(method='cookies') self.raise_login_required(method='cookies')
elif video_json['code'] == 10004001:
self.raise_geo_restricted()
elif video_json.get('message') and str(video_json['code']) != video_json['message']:
raise ExtractorError(
f'Unable to download video formats: {self.IE_NAME} said: {video_json["message"]}', expected=True)
else:
raise ExtractorError('Unable to download video formats')
video_json = video_json['playurl'] video_json = video_json['playurl']
formats = [] formats = []
for vid in video_json.get('video', []): for vid in video_json.get('video') or []:
video_res = vid.get('video_resource') or {} video_res = vid.get('video_resource') or {}
video_info = vid.get('stream_info') or {} video_info = vid.get('stream_info') or {}
if not video_res.get('url'): if not video_res.get('url'):
@@ -771,7 +783,7 @@ class BiliIntlBaseIE(InfoExtractor):
'vcodec': video_res.get('codecs'), 'vcodec': video_res.get('codecs'),
'filesize': video_res.get('size'), 'filesize': video_res.get('size'),
}) })
for aud in video_json.get('audio_resource', []): for aud in video_json.get('audio_resource') or []:
if not aud.get('url'): if not aud.get('url'):
continue continue
formats.append({ formats.append({
@@ -786,85 +798,93 @@ class BiliIntlBaseIE(InfoExtractor):
self._sort_formats(formats) self._sort_formats(formats)
return formats return formats
def _extract_ep_info(self, type, episode_data, ep_id): def _extract_ep_info(self, episode_data, ep_id):
return { return {
'id': ep_id, 'id': ep_id,
'title': episode_data.get('long_title') or episode_data['title'], 'title': episode_data.get('title_display') or episode_data['title'],
'thumbnail': episode_data.get('cover'), 'thumbnail': episode_data.get('cover'),
'episode_number': str_to_int(episode_data.get('title')), 'episode_number': int_or_none(self._search_regex(
'formats': self._get_formats(type, ep_id), r'^E(\d+)(?:$| - )', episode_data.get('title_display'), 'episode number', default=None)),
'subtitles': self._get_subtitles(type, ep_id), 'formats': self._get_formats(ep_id),
'subtitles': self._get_subtitles(ep_id),
'extractor_key': BiliIntlIE.ie_key(), 'extractor_key': BiliIntlIE.ie_key(),
} }
class BiliIntlIE(BiliIntlBaseIE): class BiliIntlIE(BiliIntlBaseIE):
_VALID_URL = r'https?://(?:www\.)?bili(?P<type>bili\.tv|intl.com)/(?:[a-z]{2}/)?play/(?P<season_id>\d+)/(?P<id>\d+)' _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P<season_id>\d+)/(?P<id>\d+)'
_TESTS = [{ _TESTS = [{
'url': 'https://www.bilibili.tv/en/play/34613/341736', 'url': 'https://www.bilibili.tv/en/play/34613/341736',
'info_dict': { 'info_dict': {
'id': '341736', 'id': '341736',
'ext': 'mp4', 'ext': 'mp4',
'title': 'The First Night', 'title': 'E2 - The First Night',
'thumbnail': 'https://i0.hdslb.com/bfs/intl/management/91e30e5521235d9b163339a26a0b030ebda54310.png', 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
'episode_number': 2, 'episode_number': 2,
}, }
'params': { }, {
'format': 'bv', 'url': 'https://www.bilibili.tv/en/play/1033760/11005006',
}, 'info_dict': {
'id': '11005006',
'ext': 'mp4',
'title': 'E3 - Who?',
'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
'episode_number': 3,
}
}, { }, {
'url': 'https://www.biliintl.com/en/play/34613/341736', 'url': 'https://www.biliintl.com/en/play/34613/341736',
'info_dict': { 'only_matching': True,
'id': '341736',
'ext': 'mp4',
'title': 'The First Night',
'thumbnail': 'https://i0.hdslb.com/bfs/intl/management/91e30e5521235d9b163339a26a0b030ebda54310.png',
'episode_number': 2,
},
'params': {
'format': 'bv',
},
}] }]
def _real_extract(self, url): def _real_extract(self, url):
type, season_id, id = self._match_valid_url(url).groups() season_id, video_id = self._match_valid_url(url).groups()
data_json = self._call_api(type, f'/web/view/ogv_collection?season_id={season_id}', id) webpage = self._download_webpage(url, video_id)
# Bstation layout
initial_data = self._parse_json(self._search_regex(
r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage,
'preload state', default='{}'), video_id, fatal=False) or {}
episode_data = traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict)
if not episode_data:
# Non-Bstation layout, read through episode list
season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id)
episode_data = next( episode_data = next(
episode for episode in data_json.get('episodes', []) episode for episode in traverse_obj(season_json, ('sections', ..., 'episodes', ...), expected_type=dict)
if str(episode.get('ep_id')) == id) if str(episode.get('episode_id')) == video_id)
return self._extract_ep_info(type, episode_data, id) return self._extract_ep_info(episode_data, video_id)
class BiliIntlSeriesIE(BiliIntlBaseIE): class BiliIntlSeriesIE(BiliIntlBaseIE):
_VALID_URL = r'https?://(?:www\.)?bili(?P<type>bili\.tv|intl.com)/(?:[a-z]{2}/)?play/(?P<id>\d+)$' _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P<id>\d+)$'
_TESTS = [{ _TESTS = [{
'url': 'https://www.bilibili.tv/en/play/34613', 'url': 'https://www.bilibili.tv/en/play/34613',
'playlist_mincount': 15, 'playlist_mincount': 15,
'info_dict': { 'info_dict': {
'id': '34613', 'id': '34613',
'title': 'Fly Me to the Moon',
'description': 'md5:a861ee1c4dc0acfad85f557cc42ac627',
'categories': ['Romance', 'Comedy', 'Slice of life'],
'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
'view_count': int,
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
'format': 'bv',
}, },
}, { }, {
'url': 'https://www.biliintl.com/en/play/34613', 'url': 'https://www.biliintl.com/en/play/34613',
'playlist_mincount': 15, 'only_matching': True,
'info_dict': {
'id': '34613',
},
'params': {
'skip_download': True,
'format': 'bv',
},
}] }]
def _entries(self, id, type): def _entries(self, series_id):
data_json = self._call_api(type, f'/web/view/ogv_collection?season_id={id}', id) series_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={series_id}&platform=web', series_id)
for episode in data_json.get('episodes', []): for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict, default=[]):
episode_id = str(episode.get('ep_id')) episode_id = str(episode.get('episode_id'))
yield self._extract_ep_info(type, episode, episode_id) yield self._extract_ep_info(episode, episode_id)
def _real_extract(self, url): def _real_extract(self, url):
type, id = self._match_valid_url(url).groups() series_id = self._match_id(url)
return self.playlist_result(self._entries(id, type), playlist_id=id) series_info = self._call_api(f'/web/v2/ogv/play/season_info?season_id={series_id}&platform=web', series_id).get('season') or {}
return self.playlist_result(
self._entries(series_id), series_id, series_info.get('title'), series_info.get('description'),
categories=traverse_obj(series_info, ('styles', ..., 'title'), expected_type=str_or_none),
thumbnail=url_or_none(series_info.get('horizontal_cover')), view_count=parse_count(series_info.get('view')))