# streamlink/src/streamlink/plugins/youtube.py

"""
$description Global live-streaming and video hosting social platform owned by Google.
$url youtube.com
$url youtu.be
$type live, vod
$notes Protected videos are not supported
"""

import json
import logging
import re
from urllib.parse import urlparse, urlunparse

from streamlink.plugin import Plugin, PluginError, pluginmatcher
from streamlink.plugin.api import useragents, validate
from streamlink.stream.ffmpegmux import MuxedStream
from streamlink.stream.hls import HLSStream
from streamlink.stream.http import HTTPStream
from streamlink.utils.data import search_dict
from streamlink.utils.parse import parse_json


# Module-level logger, named after this module; used below for debug, trace and error output.
log = logging.getLogger(__name__)


@pluginmatcher(name="default", pattern=re.compile(
    r"https?://(?:\w+\.)?youtube\.com/(?:v/|live/|watch\?(?:.*&)?v=)(?P<video_id>[\w-]{11})",
))
@pluginmatcher(name="channel", pattern=re.compile(
    r"https?://(?:\w+\.)?youtube\.com/(?:@|c(?:hannel)?/|user/)?(?P<channel>[^/?]+)(?P<live>/live)?/?$",
))
@pluginmatcher(name="embed", pattern=re.compile(
    r"https?://(?:\w+\.)?youtube\.com/embed/(?:live_stream\?channel=(?P<live>[^/?&]+)|(?P<video_id>[\w-]{11}))",
))
@pluginmatcher(name="shorthand", pattern=re.compile(
    r"https?://youtu\.be/(?P<video_id>[\w-]{11})",
))
class YouTube(Plugin):
    """
    Plugin resolving YouTube watch, channel, embed and youtu.be URLs to streams.

    The player response is read from the watch page (``ytInitialPlayerResponse``)
    and, if that is missing or not playable, from the ``youtubei/v1/player`` endpoint.
    Streams are built from the progressive formats, the adaptive formats
    (non-live videos only) and the HLS manifest, whichever are present.
    """

    # JSON object assigned to `ytInitialData` on a channel page (used to find a video ID)
    _re_ytInitialData = re.compile(r"""var\s+ytInitialData\s*=\s*({.*?})\s*;\s*</script>""", re.DOTALL)
    # JSON object assigned to `ytInitialPlayerResponse` on a watch page
    _re_ytInitialPlayerResponse = re.compile(r"""var\s+ytInitialPlayerResponse\s*=\s*({.*?});\s*var\s+\w+\s*=""", re.DOTALL)

    # URL templates which the various input URL forms are rewritten to
    _url_canonical = "https://www.youtube.com/watch?v={video_id}"
    _url_channelid_live = "https://www.youtube.com/channel/{channel_id}/live"

    # Adaptive video itag -> stream name used for the muxed streams
    # There are missing itags
    adp_video = {
        137: "1080p",
        299: "1080p60",  # HFR
        264: "1440p",
        308: "1440p60",  # HFR
        266: "2160p",
        315: "2160p60",  # HFR
        138: "2160p",
        302: "720p60",  # HFR
        135: "480p",
        133: "240p",
        160: "144p",
    }
    # Adaptive audio itag -> bitrate value used for ranking audio streams
    # (unit is presumably kbit/s - TODO confirm)
    adp_audio = {
        140: 128,
        141: 256,
        171: 128,
        249: 48,
        250: 64,
        251: 160,
        256: 256,
        258: 258,
    }

    def __init__(self, *args, **kwargs):
        """
        Initialize the plugin and normalize ``self.url``.

        All positional and keyword arguments are passed through to ``Plugin.__init__``.
        Also sets a Chrome User-Agent header on the session's HTTP client.
        """
        super().__init__(*args, **kwargs)
        parsed = urlparse(self.url)

        # translate input URLs to be able to find embedded data and to avoid unnecessary HTTP redirects
        if parsed.netloc == "gaming.youtube.com":
            self.url = urlunparse(parsed._replace(scheme="https", netloc="www.youtube.com"))
        elif self.matches["shorthand"]:
            self.url = self._url_canonical.format(video_id=self.match["video_id"])
        elif self.matches["embed"] and self.match["video_id"]:
            self.url = self._url_canonical.format(video_id=self.match["video_id"])
        elif self.matches["embed"] and self.match["live"]:
            self.url = self._url_channelid_live.format(channel_id=self.match["live"])
        elif parsed.scheme != "https":
            self.url = urlunparse(parsed._replace(scheme="https"))

        self.session.http.headers.update({"User-Agent": useragents.CHROME})

    @classmethod
    def stream_weight(cls, stream):
        """
        Return the ``(weight, group)`` pair for the stream name ``stream``.

        Names ending a word with ``_3d`` get the weight of their base name minus one
        in the ``"youtube_3d"`` group. High frame rate names such as ``1080p60`` get
        the weight of their resolution part plus one in the ``"high_frame_rate"``
        group. Every other name is delegated unchanged to ``Plugin.stream_weight``.
        """
        match_3d = re.match(r"(\w+)_3d", stream)
        if match_3d:
            # only the weight of the base quality is needed, the group gets replaced
            weight, _ = Plugin.stream_weight(match_3d.group(1))
            return weight - 1, "youtube_3d"

        match_hfr = re.match(r"(\d+p)(\d+)", stream)
        if match_hfr:
            weight, _ = Plugin.stream_weight(match_hfr.group(1))
            return weight + 1, "high_frame_rate"

        weight, group = Plugin.stream_weight(stream)
        return weight, group

    @staticmethod
    def _schema_consent(data):
        """
        Extract the consent form from the HTML text ``data``.

        Accepts either the form posting to ``https://consent.youtube.com/s`` or the
        first form posting to ``https://consent.youtube.com/save`` which contains a
        hidden ``set_ytc=true`` input.

        Returns a 2-tuple of the form's ``action`` and its hidden ``<input>`` elements.
        """
        schema_consent = validate.Schema(
            validate.parse_html(),
            validate.any(
                validate.xml_find(".//form[@action='https://consent.youtube.com/s']"),
                validate.all(
                    validate.xml_xpath(".//form[@action='https://consent.youtube.com/save']"),
                    validate.filter(lambda elem: elem.xpath(".//input[@type='hidden'][@name='set_ytc'][@value='true']")),
                    validate.get(0),
                ),
            ),
            validate.union((
                validate.get("action"),
                validate.xml_xpath(".//input[@type='hidden']"),
            )),
        )
        return schema_consent.validate(data)

    def _schema_canonical(self, data):
        """
        Return the video ID found in the canonical ``<link>`` of the HTML text ``data``.

        The link's ``href`` has to match the pattern of the "default" URL matcher.
        """
        schema_canonical = validate.Schema(
            validate.parse_html(),
            validate.xml_xpath_string(".//link[@rel='canonical'][1]/@href"),
            validate.regex(self.matchers["default"].pattern),
            validate.get("video_id"),
        )
        return schema_canonical.validate(data)

    @classmethod
    def _schema_playabilitystatus(cls, data):
        """
        Validate ``data["playabilityStatus"]`` and return its ``(status, reason)`` pair.

        ``status`` is required, ``reason`` is an optional key of the input.
        """
        schema = validate.Schema(
            {"playabilityStatus": {
                "status": str,
                validate.optional("reason"): str,
            }},
            validate.get("playabilityStatus"),
            validate.union_get("status", "reason"),
        )
        return schema.validate(data)

    @classmethod
    def _schema_videodetails(cls, data):
        """
        Validate the video metadata of the player response ``data``.

        Returns a 5-tuple of video ID, author, category, title and the ``isLive`` flag.
        The category is read from either ``playerMicroformatRenderer`` or
        ``microformatDataRenderer`` of the ``microformat`` object.
        """
        schema = validate.Schema(
            {
                "videoDetails": {
                    "videoId": str,
                    "author": str,
                    "title": str,
                    validate.optional("isLive"): validate.transform(bool),
                    validate.optional("isLiveContent"): validate.transform(bool),
                    validate.optional("isLiveDvrEnabled"): validate.transform(bool),
                    validate.optional("isLowLatencyLiveStream"): validate.transform(bool),
                    validate.optional("isPrivate"): validate.transform(bool),
                },
                "microformat": validate.all(
                    validate.any(
                        validate.all(
                            {"playerMicroformatRenderer": dict},
                            validate.get("playerMicroformatRenderer"),
                        ),
                        validate.all(
                            {"microformatDataRenderer": dict},
                            validate.get("microformatDataRenderer"),
                        ),
                    ),
                    {
                        "category": str,
                    },
                ),
            },
            validate.union_get(
                ("videoDetails", "videoId"),
                ("videoDetails", "author"),
                ("microformat", "category"),
                ("videoDetails", "title"),
                ("videoDetails", "isLive"),
            ),
        )
        videoDetails = schema.validate(data)
        log.trace(f"videoDetails = {videoDetails!r}")
        return videoDetails

    @classmethod
    def _schema_streamingdata(cls, data):
        """
        Validate ``data["streamingData"]`` of the player response.

        Returns a 3-tuple ``(hls_manifest, formats, adaptive_formats)``:

        - ``hls_manifest``: the ``hlsManifestUrl`` value, if any
        - ``formats``: list of ``(url, qualityLabel)`` items
        - ``adaptive_formats``: list of ``(url, qualityLabel, itag, (type, codecs))`` items,
          where ``type`` and ``codecs`` are parsed from the ``mimeType`` string

        Missing lists are returned as empty lists. ``url`` is an optional key of each item.
        """
        schema = validate.Schema(
            {"streamingData": {
                validate.optional("hlsManifestUrl"): str,
                validate.optional("formats"): [validate.all(
                    {
                        "itag": int,
                        "qualityLabel": str,
                        validate.optional("url"): validate.url(scheme="http"),
                    },
                    validate.union_get("url", "qualityLabel"),
                )],
                validate.optional("adaptiveFormats"): [validate.all(
                    {
                        "itag": int,
                        "mimeType": validate.all(
                            str,
                            validate.regex(re.compile(r"""^(?P<type>\w+)/(?P<container>\w+); codecs="(?P<codecs>.+)"$""")),
                            validate.union_get("type", "codecs"),
                        ),
                        validate.optional("url"): validate.url(scheme="http"),
                        validate.optional("qualityLabel"): str,
                    },
                    validate.union_get("url", "qualityLabel", "itag", "mimeType"),
                )],
            }},
            validate.get("streamingData"),
            validate.union_get("hlsManifestUrl", "formats", "adaptiveFormats"),
        )
        hls_manifest, formats, adaptive_formats = schema.validate(data)
        return hls_manifest, formats or [], adaptive_formats or []

    def _create_adaptive_streams(self, adaptive_formats):
        """
        Build streams from the adaptive formats of a video.

        ``adaptive_formats`` is a list of ``(url, qualityLabel, itag, (type, codecs))``
        items, as returned by ``_schema_streamingdata``. Items without URL are skipped.

        Returns a dict with one ``audio_<codec>`` HTTP stream per audio codec (the itag
        with the highest ``adp_audio`` value per codec) and, if muxing is usable, one
        muxed stream per known ``adp_video`` itag, combined with the overall best audio.
        """
        streams = {}
        adaptive_streams = {}
        audio_streams = {}
        best_audio_itag = None

        # Extract audio streams from the adaptive format list
        for url, _label, itag, mime_type in adaptive_formats:
            if url is None:
                continue

            # extract any high quality streams only available in adaptive formats
            adaptive_streams[itag] = url
            stream_type, stream_codec = mime_type
            # reduce the codec string to its leading word, e.g. "mp4a.40.2" -> "mp4a"
            stream_codec = re.sub(r"^(\w+).*$", r"\1", stream_codec)

            if stream_type == "audio" and itag in self.adp_audio:
                audio_bitrate = self.adp_audio[itag]
                if stream_codec not in audio_streams or audio_bitrate > self.adp_audio[audio_streams[stream_codec]]:
                    audio_streams[stream_codec] = itag

                # find the best quality audio stream m4a, opus or vorbis
                if best_audio_itag is None or audio_bitrate > self.adp_audio[best_audio_itag]:
                    best_audio_itag = itag

        streams.update({
            f"audio_{codec}": HTTPStream(self.session, adaptive_streams[audio_itag])
            for codec, audio_itag in audio_streams.items()
        })

        # a best audio itag implies that adaptive_streams is not empty
        if best_audio_itag is not None and MuxedStream.is_usable(self.session):
            aurl = adaptive_streams[best_audio_itag]
            for itag, name in self.adp_video.items():
                if itag not in adaptive_streams:
                    continue
                vurl = adaptive_streams[itag]
                log.debug(f"MuxedStream: v {itag} a {best_audio_itag} = {name}")
                streams[name] = MuxedStream(
                    self.session,
                    HTTPStream(self.session, vurl),
                    HTTPStream(self.session, aurl),
                )

        return streams

    def _get_res(self, url):
        """
        GET ``url`` and return the HTTP response.

        If the request ended up on ``consent.youtube.com``, the consent form's hidden
        inputs are submitted to the form's target and that response is returned instead.
        """
        res = self.session.http.get(url)
        if urlparse(res.url).netloc == "consent.youtube.com":
            target, elems = self._schema_consent(res.text)
            c_data = {}
            for elem in elems:
                name = elem.attrib.get("name")
                # inputs without a name can't be submitted and a None key would break the log output below
                if name is None:
                    continue
                c_data[name] = elem.attrib.get("value")
            log.debug(f"consent target: {target}")
            log.debug(f"consent data: {', '.join(c_data)}")
            res = self.session.http.post(target, data=c_data)
        return res

    @staticmethod
    def _get_data_from_regex(res, regex, descr):
        """
        Search ``res.text`` for ``regex`` and parse its first capture group as JSON.

        Returns the parsed data, or ``None`` if there's no match, in which case
        ``descr`` is used for naming the missing data in the debug log.
        """
        match = re.search(regex, res.text)
        if not match:
            log.debug(f"Missing {descr}")
            return None
        return parse_json(match.group(1))

    def _get_data_from_api(self, res):
        """
        Request the player response from the ``youtubei/v1/player`` endpoint.

        The video ID is taken from the matched input URL or, if the URL has none, from
        the canonical link of the page in ``res``. API key and client version are read
        from ``res.text``, with hardcoded fallback values.

        Returns the parsed JSON response, or ``None`` if no video ID could be found.
        """
        try:
            video_id = self.match["video_id"]
        except IndexError:
            # the matched URL pattern has no "video_id" group
            video_id = None

        if video_id is None:
            try:
                video_id = self._schema_canonical(res.text)
            except (PluginError, TypeError):
                return None

        match_key = re.search(r'"INNERTUBE_API_KEY":\s*"([^"]+)"', res.text)
        api_key = match_key.group(1) if match_key else "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8"

        match_version = re.search(r'"INNERTUBE_CLIENT_VERSION":\s*"([\d\.]+)"', res.text)
        client_version = match_version.group(1) if match_version else "1.20210616.1.0"

        res = self.session.http.post(
            "https://www.youtube.com/youtubei/v1/player",
            headers={"Content-Type": "application/json"},
            params={"key": api_key},
            data=json.dumps({
                "videoId": video_id,
                "contentCheckOk": True,
                "racyCheckOk": True,
                "context": {
                    "client": {
                        "clientName": "WEB",
                        "clientVersion": client_version,
                        "platform": "DESKTOP",
                        "clientScreen": "EMBED",
                        "clientFormFactor": "UNKNOWN_FORM_FACTOR",
                        "browserName": "Chrome",
                    },
                    "user": {"lockedSafetyMode": "false"},
                    "request": {"useSsl": "true"},
                },
            }),
        )
        return parse_json(res.text)

    @staticmethod
    def _data_video_id(data):
        """
        Return the first ``videoId`` of any ``videoRenderer`` object found in ``data``.

        Returns ``None`` if ``data`` is empty or if no video ID was found.
        """
        if not data:
            return None
        for videoRenderer in search_dict(data, "videoRenderer"):
            videoId = videoRenderer.get("videoId")
            if videoId is not None:
                return videoId
        return None

    def _data_status(self, data, errorlog=False):
        """
        Check whether the player response ``data`` has the playability status ``"OK"``.

        Returns ``False`` for empty data or any other status. With ``errorlog`` set,
        a non-OK status and its reason are logged as an error.
        """
        if not data:
            return False
        status, reason = self._schema_playabilitystatus(data)
        if status != "OK":
            if errorlog:
                log.error(f"Could not get video info - {status}: {reason}")
            return False
        return True

    def _get_streams(self):
        """
        Resolve ``self.url`` and return a dict of stream names and streams.

        Returns ``None`` if no video ID could be found on a channel page or if the
        video is not playable. Also sets the ``id``, ``author``, ``category`` and
        ``title`` attributes of the plugin.

        :raises PluginError: if no streams were found and at least one format had no URL
        """
        res = self._get_res(self.url)

        # channel URLs without "/live" suffix: look up a video ID on the channel page first
        if self.matches["channel"] and not self.match["live"]:
            initial = self._get_data_from_regex(res, self._re_ytInitialData, "initial data")
            video_id = self._data_video_id(initial)
            if video_id is None:
                log.error("Could not find videoId on channel page")
                return
            self.url = self._url_canonical.format(video_id=video_id)
            res = self._get_res(self.url)

        # prefer the player response embedded in the page and fall back to the API
        data = self._get_data_from_regex(res, self._re_ytInitialPlayerResponse, "initial player response")
        if not self._data_status(data):
            data = self._get_data_from_api(res)
            if not self._data_status(data, True):
                return

        self.id, self.author, self.category, self.title, is_live = self._schema_videodetails(data)
        log.debug(f"Using video ID: {self.id}")

        if is_live:
            log.debug("This video is live.")

        streams = {}
        hls_manifest, formats, adaptive_formats = self._schema_streamingdata(data)

        # formats without URL are treated as a sign of a protected video
        protected = any(url is None for url, *_ in formats + adaptive_formats)
        if protected:
            log.debug("This video may be protected.")

        for url, label in formats:
            if url is None:
                continue
            streams[label] = HTTPStream(self.session, url)

        if not is_live:
            streams.update(self._create_adaptive_streams(adaptive_formats))

        if hls_manifest:
            streams.update(HLSStream.parse_variant_playlist(self.session, hls_manifest, name_key="pixels"))

        if not streams and protected:
            raise PluginError("This plugin does not support protected videos, try youtube-dl instead")

        return streams


# Module-level reference to the plugin class
# (presumably the name streamlink's plugin loader looks up - TODO confirm).
__plugin__ = YouTube