# streamlink/src/streamlink/plugins/twitch.py

"""
$description Global live-streaming and video hosting social platform owned by Amazon.
$url twitch.tv
$type live, vod
$notes See the :ref:`Authentication <cli/plugins/twitch:Authentication>` docs on how to prevent ads.
$notes Read more about :ref:`embedded ads <cli/plugins/twitch:Embedded ads>` here.
$notes :ref:`Low latency streaming <cli/plugins/twitch:Low latency streaming>` is supported.
"""
import argparse
import logging
import re
import sys
from datetime import datetime, timedelta
from random import random
from typing import List, NamedTuple, Optional
from urllib.parse import urlparse

from streamlink.exceptions import NoStreamsError, PluginError
from streamlink.plugin import Plugin, pluginargument, pluginmatcher
from streamlink.plugin.api import validate
from streamlink.stream.hls import HLSStream, HLSStreamReader, HLSStreamWorker, HLSStreamWriter
from streamlink.stream.hls_playlist import M3U8, ByteRange, DateRange, ExtInf, Key, M3U8Parser, Map, load as load_hls_playlist
from streamlink.stream.http import HTTPStream
from streamlink.utils.args import keyvalue
from streamlink.utils.parse import parse_json, parse_qsd
from streamlink.utils.times import hours_minutes_seconds
from streamlink.utils.url import update_qsd

log = logging.getLogger(__name__)
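
# Maximum value for --hls-live-edge when low-latency streaming is enabled (lower values reduce latency further)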
LOW_LATENCY_MAX_LIVE_EDGE = 2


class TwitchSegment(NamedTuple):
uri: str
duration: float
title: Optional[str]
key: Optional[Key]
discontinuity: bool
byterange: Optional[ByteRange]
date: Optional[datetime]
map: Optional[Map]
ad: bool
prefetch: bool


# generic namedtuples are unsupported, so just subclass
class TwitchSequence(NamedTuple):
num: int
segment: TwitchSegment


class TwitchM3U8(M3U8):
segments: List[TwitchSegment] # type: ignore[assignment]
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.dateranges_ads = []


class TwitchM3U8Parser(M3U8Parser):
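    """
    Extends the HLS playlist parser: flags ad segments via dateranges and segment titles,
    and appends Twitch's #EXT-X-TWITCH-PREFETCH hints as segments for low-latency streaming.
    """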
m3u8: TwitchM3U8
def parse_tag_ext_x_twitch_prefetch(self, value):
segments = self.m3u8.segments
if not segments: # pragma: no cover
return
last = segments[-1]
# Use the average duration of all regular segments for the duration of prefetch segments.
# This is better than using the duration of the last segment when regular segment durations vary a lot.
# In low latency mode, the playlist reload time is the duration of the last segment.
duration = last.duration if last.prefetch else sum(segment.duration for segment in segments) / float(len(segments))
# Use the last duration for extrapolating the start time of the prefetch segment, which is needed for checking
# whether it is an ad segment and matches the parsed date ranges or not
date = last.date + timedelta(seconds=last.duration)
# Don't pop() the discontinuity state in prefetch segments (at the bottom of the playlist)
discontinuity = self.state.get("discontinuity", False)
# Always treat prefetch segments after a discontinuity as ad segments
ad = discontinuity or self._is_segment_ad(date)
segment = last._replace(
uri=self.uri(value),
duration=duration,
title=None,
discontinuity=discontinuity,
date=date,
ad=ad,
prefetch=True,
)
segments.append(segment)
def parse_tag_ext_x_daterange(self, value):
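        # Remember dateranges which mark stitched ads, so segment dates can be matched against them later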
super().parse_tag_ext_x_daterange(value)
daterange = self.m3u8.dateranges[-1]
if self._is_daterange_ad(daterange):
self.m3u8.dateranges_ads.append(daterange)
def get_segment(self, uri: str) -> TwitchSegment: # type: ignore[override]
extinf: ExtInf = self.state.pop("extinf", None) or ExtInf(0, None)
date = self.state.pop("date", None)
ad = self._is_segment_ad(date, extinf.title)
return TwitchSegment(
uri=uri,
duration=extinf.duration,
title=extinf.title,
key=self.state.get("key"),
discontinuity=self.state.pop("discontinuity", False),
byterange=self.state.pop("byterange", None),
date=date,
map=self.state.get("map"),
ad=ad,
prefetch=False,
)
    def _is_segment_ad(self, date: Optional[datetime], title: Optional[str] = None) -> bool:
return (
title is not None and "Amazon" in title
or any(self.m3u8.is_date_in_daterange(date, daterange) for daterange in self.m3u8.dateranges_ads)
)
@staticmethod
def _is_daterange_ad(daterange: DateRange) -> bool:
return (
daterange.classname == "twitch-stitched-ad"
or str(daterange.id or "").startswith("stitched-ad-")
or any(attr_key.startswith("X-TV-TWITCH-AD-") for attr_key in daterange.x.keys())
)


class TwitchHLSStreamWorker(HLSStreamWorker):
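    """
    Playlist worker which shortens reload times in low-latency mode (reload after the last segment's duration)
    and logs when pre-roll ads are being waited out or when a stream turns out not to be low latency.
    """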
reader: "TwitchHLSStreamReader"
writer: "TwitchHLSStreamWriter"
stream: "TwitchHLSStream"
def __init__(self, reader, *args, **kwargs):
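        # Set to True once a sequence with real (non-ad) content has been seen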
self.had_content = False
super().__init__(reader, *args, **kwargs)
def _reload_playlist(self, *args):
return load_hls_playlist(*args, parser=TwitchM3U8Parser, m3u8=TwitchM3U8)
def _playlist_reload_time(self, playlist: TwitchM3U8, sequences: List[TwitchSequence]): # type: ignore[override]
if self.stream.low_latency and sequences:
return sequences[-1].segment.duration
return super()._playlist_reload_time(playlist, sequences) # type: ignore[arg-type]
def process_sequences(self, playlist: TwitchM3U8, sequences: List[TwitchSequence]): # type: ignore[override]
# ignore prefetch segments if not LL streaming
if not self.stream.low_latency:
sequences = [seq for seq in sequences if not seq.segment.prefetch]
# check for sequences with real content
if not self.had_content:
self.had_content = next((True for seq in sequences if not seq.segment.ad), False)
# When filtering ads, to check whether it's a LL stream, we need to wait for the real content to show up,
# since playlists with only ad segments don't contain prefetch segments
if (
self.stream.low_latency
and self.had_content
and not next((True for seq in sequences if seq.segment.prefetch), False)
):
log.info("This is not a low latency stream")
# show pre-roll ads message only on the first playlist containing ads
if self.stream.disable_ads and self.playlist_sequence == -1 and not self.had_content:
log.info("Waiting for pre-roll ads to finish, be patient")
return super().process_sequences(playlist, sequences) # type: ignore[arg-type]


class TwitchHLSStreamWriter(HLSStreamWriter):
reader: "TwitchHLSStreamReader"
stream: "TwitchHLSStream"
def should_filter_sequence(self, sequence: TwitchSequence): # type: ignore[override]
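        # Drop segments which were flagged as ads, but only if --twitch-disable-ads was set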
return self.stream.disable_ads and sequence.segment.ad


class TwitchHLSStreamReader(HLSStreamReader):
__worker__ = TwitchHLSStreamWorker
__writer__ = TwitchHLSStreamWriter
worker: "TwitchHLSStreamWorker"
writer: "TwitchHLSStreamWriter"
stream: "TwitchHLSStream"
def __init__(self, stream: "TwitchHLSStream"):
if stream.disable_ads:
log.info("Will skip ad segments")
if stream.low_latency:
live_edge = max(1, min(LOW_LATENCY_MAX_LIVE_EDGE, stream.session.options.get("hls-live-edge")))
stream.session.options.set("hls-live-edge", live_edge)
stream.session.options.set("hls-segment-stream-data", True)
log.info(f"Low latency streaming (HLS live edge: {live_edge})")
super().__init__(stream)


class TwitchHLSStream(HLSStream):
__reader__ = TwitchHLSStreamReader
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.disable_ads = self.session.get_plugin_option("twitch", "disable-ads")
self.low_latency = self.session.get_plugin_option("twitch", "low-latency")


class UsherService:
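    """Builds the playlist request URLs for Twitch's Usher service (usher.ttvnw.net)."""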
def __init__(self, session):
self.session = session
def _create_url(self, endpoint, **extra_params):
url = f"https://usher.ttvnw.net{endpoint}"
params = {
"player": "twitchweb",
"p": int(random() * 999999),
"type": "any",
"allow_source": "true",
"allow_audio_only": "true",
"allow_spectre": "false",
}
params.update(extra_params)
req = self.session.http.prepare_new_request(url=url, params=params)
return req.url
def channel(self, channel, **extra_params):
try:
extra_params_debug = validate.Schema(
validate.get("token"),
validate.parse_json(),
{
"adblock": bool,
"geoblock_reason": str,
"hide_ads": bool,
"server_ads": bool,
"show_ads": bool,
},
).validate(extra_params)
log.debug(f"{extra_params_debug!r}")
except PluginError:
pass
return self._create_url(f"/api/channel/hls/{channel}.m3u8", **extra_params)
def video(self, video_id, **extra_params):
return self._create_url(f"/vod/{video_id}", **extra_params)


class TwitchAPI:
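    """Minimal client for Twitch's GraphQL API (gql.twitch.tv), sending a default public Client-ID with every request."""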
CLIENT_ID = "kimne78kx3ncx6brgo4mv6wki5h1ko"
def __init__(self, session):
self.session = session
self.headers = {
"Client-ID": self.CLIENT_ID,
}
self.headers.update(**dict(session.get_plugin_option("twitch", "api-header") or []))
self.access_token_params = dict(session.get_plugin_option("twitch", "access-token-param") or [])
self.access_token_params.setdefault("playerType", "embed")
def call(self, data, schema=None, **kwargs):
res = self.session.http.post(
"https://gql.twitch.tv/gql",
json=data,
headers={**self.headers, **kwargs.pop("headers", {})},
**kwargs,
)
return self.session.http.json(res, schema=schema)
@staticmethod
def _gql_persisted_query(operationname, sha256hash, **variables):
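        # Build a GraphQL "persisted query" payload: operation name, SHA-256 hash of the query, and its variables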
return {
"operationName": operationname,
"extensions": {
"persistedQuery": {
"version": 1,
"sha256Hash": sha256hash,
},
},
"variables": dict(**variables),
}
@staticmethod
def parse_token(tokenstr):
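        # Extract the names of restricted renditions (e.g. subscriber-only qualities) from the token's chansub data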
return parse_json(tokenstr, schema=validate.Schema(
{"chansub": {"restricted_bitrates": validate.all(
[str],
validate.filter(lambda n: not re.match(r"(.+_)?archives|live|chunked", n)),
)}},
validate.get(("chansub", "restricted_bitrates")),
))
# GraphQL API calls
def metadata_video(self, video_id):
query = self._gql_persisted_query(
"VideoMetadata",
"cb3b1eb2f2d2b2f65b8389ba446ec521d76c3aa44f5424a1b1d235fe21eb4806",
channelLogin="", # parameter can be empty
videoID=video_id,
)
return self.call(query, schema=validate.Schema(
{"data": {"video": {
"id": str,
"owner": {
"displayName": str,
},
"title": str,
"game": {
"displayName": str,
},
}}},
validate.get(("data", "video")),
validate.union_get(
"id",
("owner", "displayName"),
("game", "displayName"),
"title",
),
))
def metadata_channel(self, channel):
queries = [
self._gql_persisted_query(
"ChannelShell",
"c3ea5a669ec074a58df5c11ce3c27093fa38534c94286dc14b68a25d5adcbf55",
login=channel,
lcpVideosEnabled=False,
),
self._gql_persisted_query(
"StreamMetadata",
"059c4653b788f5bdb2f5a2d2a24b0ddc3831a15079001a3d927556a96fb0517f",
channelLogin=channel,
),
]
return self.call(queries, schema=validate.Schema(
[
validate.all(
{"data": {"userOrError": {
"displayName": str,
}}},
),
validate.all(
{"data": {"user": {
"lastBroadcast": {
"title": str,
},
"stream": {
"id": str,
"game": {
"name": str,
},
},
}}},
),
],
validate.union_get(
(1, "data", "user", "stream", "id"),
(0, "data", "userOrError", "displayName"),
(1, "data", "user", "stream", "game", "name"),
(1, "data", "user", "lastBroadcast", "title"),
),
))
def metadata_clips(self, clipname):
queries = [
self._gql_persisted_query(
"ClipsView",
"4480c1dcc2494a17bb6ef64b94a5213a956afb8a45fe314c66b0d04079a93a8f",
slug=clipname,
),
self._gql_persisted_query(
"ClipsTitle",
"f6cca7f2fdfbfc2cecea0c88452500dae569191e58a265f97711f8f2a838f5b4",
slug=clipname,
),
]
return self.call(queries, schema=validate.Schema(
[
validate.all(
{"data": {"clip": {
"id": str,
"broadcaster": {"displayName": str},
"game": {"name": str},
}}},
validate.get(("data", "clip")),
),
validate.all(
{"data": {"clip": {"title": str}}},
validate.get(("data", "clip")),
),
],
validate.union_get(
(0, "id"),
(0, "broadcaster", "displayName"),
(0, "game", "name"),
(1, "title"),
),
))
def access_token(self, is_live, channel_or_vod):
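        # Request the PlaybackAccessToken (signature + token value) for either a live channel or a VOD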
query = self._gql_persisted_query(
"PlaybackAccessToken",
"0828119ded1c13477966434e15800ff57ddacf13ba1911c129dc2200705b0712",
isLive=is_live,
login=channel_or_vod if is_live else "",
isVod=not is_live,
vodID=channel_or_vod if not is_live else "",
**self.access_token_params,
)
subschema = validate.none_or_all(
{
"value": str,
"signature": str,
},
validate.union_get("signature", "value"),
)
return self.call(query, acceptable_status=(200, 400, 401, 403), schema=validate.Schema(
validate.any(
validate.all(
{"error": str, "message": str},
validate.union_get("error", "message"),
validate.transform(lambda data: ("error", *data)),
),
validate.all(
{
"data": validate.any(
validate.all(
{"streamPlaybackAccessToken": subschema},
validate.get("streamPlaybackAccessToken"),
),
validate.all(
{"videoPlaybackAccessToken": subschema},
validate.get("videoPlaybackAccessToken"),
),
),
},
validate.get("data"),
validate.transform(lambda data: ("token", *data)),
),
),
))
def clips(self, clipname):
query = self._gql_persisted_query(
"VideoAccessToken_Clip",
"36b89d2507fce29e5ca551df756d27c1cfe079e2609642b4390aa4c35796eb11",
slug=clipname,
)
return self.call(query, schema=validate.Schema(
{"data": {"clip": {
"playbackAccessToken": {
"signature": str,
"value": str,
},
"videoQualities": [validate.all(
{
"frameRate": validate.transform(int),
"quality": str,
"sourceURL": validate.url(),
},
validate.transform(lambda q: (
f"{q['quality']}p{q['frameRate']}",
q["sourceURL"],
)),
)],
}}},
validate.get(("data", "clip")),
validate.union_get(
("playbackAccessToken", "signature"),
("playbackAccessToken", "value"),
"videoQualities",
),
))
def stream_metadata(self, channel):
query = self._gql_persisted_query(
"StreamMetadata",
"1c719a40e481453e5c48d9bb585d971b8b372f8ebb105b17076722264dfa5b3e",
channelLogin=channel,
)
return self.call(query, schema=validate.Schema(
{"data": {"user": {"stream": {"type": str}}}},
validate.get(("data", "user", "stream")),
))


@pluginmatcher(re.compile(r"""
https?://(?:(?P<subdomain>[\w-]+)\.)?twitch\.tv/
(?:
videos/(?P<videos_id>\d+)
|
(?P<channel>[^/?]+)
(?:
/v(?:ideo)?/(?P<video_id>\d+)
|
/clip/(?P<clip_name>[^/?]+)
)?
)
""", re.VERBOSE))
@pluginargument(
"disable-ads",
action="store_true",
help="""
Skip embedded advertisement segments at the beginning or during a stream.
Will cause these segments to be missing from the output.
""",
)
@pluginargument(
"disable-hosting",
action="store_true",
help=argparse.SUPPRESS,
)
@pluginargument(
"disable-reruns",
action="store_true",
help="Do not open the stream if the target channel is currently broadcasting a rerun.",
)
@pluginargument(
"low-latency",
action="store_true",
help=f"""
Enables low latency streaming by prefetching HLS segments.
Sets --hls-segment-stream-data to true and --hls-live-edge to `{LOW_LATENCY_MAX_LIVE_EDGE}`, if it is higher.
Reducing --hls-live-edge to `1` will result in the lowest latency possible, but will most likely cause buffering.
In order to achieve true low latency streaming during playback, the player's caching/buffering settings will
need to be adjusted and reduced to a value as low as possible, but still high enough to not cause any buffering.
This depends on the stream's bitrate and the quality of the connection to Twitch's servers. Please refer to the
player's own documentation for the required configuration. Player parameters can be set via --player-args.
Note: Low latency streams have to be enabled by the broadcasters on Twitch themselves.
Regular streams can cause buffering issues with this option enabled due to the reduced --hls-live-edge value.
""",
)
@pluginargument(
"api-header",
metavar="KEY=VALUE",
type=keyvalue,
action="append",
help="""
A header to add to each Twitch API HTTP request.
Can be repeated to add multiple headers.
Useful for adding authentication data that can prevent ads. See the plugin-specific documentation for more information.
""",
)
@pluginargument(
"access-token-param",
metavar="KEY=VALUE",
type=keyvalue,
action="append",
help="""
A parameter to add to the API request for acquiring the streaming access token.
Can be repeated to add multiple parameters.
""",
)
class Twitch(Plugin):
@classmethod
def stream_weight(cls, stream):
if stream == "source":
return sys.maxsize, stream
return super().stream_weight(stream)
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
match = self.match.groupdict()
parsed = urlparse(self.url)
self.params = parse_qsd(parsed.query)
self.subdomain = match.get("subdomain")
self.video_id = None
self.channel = None
self.clip_name = None
self._checked_metadata = False
if self.subdomain == "player":
# pop-out player
if self.params.get("video"):
self.video_id = self.params["video"]
self.channel = self.params.get("channel")
elif self.subdomain == "clips":
# clip share URL
self.clip_name = match.get("channel")
else:
self.channel = match.get("channel") and match.get("channel").lower()
self.video_id = match.get("video_id") or match.get("videos_id")
self.clip_name = match.get("clip_name")
self.api = TwitchAPI(session=self.session)
self.usher = UsherService(session=self.session)
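        # Wrap the inherited metadata getters, so the GraphQL metadata is queried only once and only when needed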
def method_factory(parent_method):
def inner():
if not self._checked_metadata:
self._checked_metadata = True
self._get_metadata()
return parent_method()
return inner
parent = super()
for metadata in "id", "author", "category", "title":
method = f"get_{metadata}"
setattr(self, method, method_factory(getattr(parent, method)))
def _get_metadata(self):
try:
if self.video_id:
data = self.api.metadata_video(self.video_id)
elif self.clip_name:
data = self.api.metadata_clips(self.clip_name)
elif self.channel:
data = self.api.metadata_channel(self.channel)
else: # pragma: no cover
return
self.id, self.author, self.category, self.title = data
except (PluginError, TypeError):
pass
def _access_token(self, is_live, channel_or_vod):
try:
response, *data = self.api.access_token(is_live, channel_or_vod)
if response != "token":
error, message = data
log.error(f"{error or 'Error'}: {message or 'Unknown error'}")
raise PluginError
sig, token = data
except (PluginError, TypeError):
raise NoStreamsError # noqa: B904
try:
restricted_bitrates = self.api.parse_token(token)
except PluginError:
restricted_bitrates = []
return sig, token, restricted_bitrates
def _check_for_rerun(self):
if not self.options.get("disable_reruns"):
return False
try:
stream = self.api.stream_metadata(self.channel)
if stream["type"] != "live":
log.info("Reruns were disabled by command line option")
return True
except (PluginError, TypeError):
pass
return False
def _get_hls_streams_live(self):
if self._check_for_rerun():
return
# only get the token once the channel has been resolved
log.debug(f"Getting live HLS streams for {self.channel}")
self.session.http.headers.update({
"referer": "https://player.twitch.tv",
"origin": "https://player.twitch.tv",
})
sig, token, restricted_bitrates = self._access_token(True, self.channel)
url = self.usher.channel(self.channel, sig=sig, token=token, fast_bread=True)
return self._get_hls_streams(url, restricted_bitrates)
def _get_hls_streams_video(self):
log.debug(f"Getting HLS streams for video ID {self.video_id}")
sig, token, restricted_bitrates = self._access_token(False, self.video_id)
url = self.usher.video(self.video_id, nauthsig=sig, nauth=token)
# If the stream is a VOD that is still being recorded, the stream should start at the beginning of the recording
return self._get_hls_streams(url, restricted_bitrates, force_restart=True)
def _get_hls_streams(self, url, restricted_bitrates, **extra_params):
time_offset = self.params.get("t", 0)
if time_offset:
try:
time_offset = hours_minutes_seconds(time_offset)
except ValueError:
time_offset = 0
try:
streams = TwitchHLSStream.parse_variant_playlist(self.session, url, start_offset=time_offset, **extra_params)
except OSError as err:
err = str(err)
if "404 Client Error" in err or "Failed to parse playlist" in err:
return
else:
raise PluginError(err) from err
for name in restricted_bitrates:
if name not in streams:
log.warning(f"The quality '{name}' is not available since it requires a subscription.")
return streams
def _get_clips(self):
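        # Clips are plain HTTP(S) downloads; append the access token's signature and value to each quality's source URL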
try:
sig, token, streams = self.api.clips(self.clip_name)
except (PluginError, TypeError):
return
for quality, stream in streams:
yield quality, HTTPStream(self.session, update_qsd(stream, {"sig": sig, "token": token}))
def _get_streams(self):
if self.video_id:
return self._get_hls_streams_video()
elif self.clip_name:
return self._get_clips()
elif self.channel:
return self._get_hls_streams_live()


__plugin__ = Twitch