# Source code for qetch.extractors.gfycat
# Copyright (c) 2018 Stephen Bunn (stephen@bunn.io)
# MIT License <https://opensource.org/licenses/MIT>
import datetime
from typing import (Any, Tuple, List, Dict, Match, Generator,)
from .. import (exceptions,)
from ..auth import (AuthTypes,)
from ..content import (Content,)
from ._common import (BaseExtractor,)
import furl
import ujson
class GfycatExtractor(BaseExtractor):
    """ The extractor for links to media from ``gfycat.com``.
    """

    name = 'gfycat'
    description = ('Site which hosts short high-quality video for sharing.')
    authentication = AuthTypes.NONE
    domains = ['gfycat.com']
    # named regexes for the url forms this extractor can handle; each
    # pattern captures the gfycat id in the ``id`` group
    handles = {
        'basic': (
            r'^https?://(?:www\.)?gfycat\.com/'
            r'(?:gifs/detail/)?(?P<id>[a-zA-Z]+)/?$'
        ),
        'raw': (
            r'^https?://(?:[a-z]+\.)gfycat\.com/'
            r'(?P<id>[a-zA-Z]+)(?:\.[a-zA-Z0-9]+)$'
        )
    }

    # NOTE(review): API is requested over plain http — confirm whether https
    # is available/preferable; left unchanged to preserve behavior.
    _api_base = 'http://gfycat.com/cajax/get/'
    # API response keys which may contain downloadable media urls
    _content_urls = (
        'mp4Url',
        'webmUrl', 'webpUrl',
        'mobileUrl', 'mobilePosterUrl',
        'posterUrl',
        'thumb360Url', 'thumb360PosterUrl', 'thumb100PosterUrl',
        'max5mbGif', 'max2mbGif',
        'mjpgUrl',
        'miniUrl', 'miniPosterUrl',
        'gifUrl',
    )
    # relative quality ranking for known url types; unknown types fall
    # back to 0.0 in ``handle_basic``
    _quality_map = {
        'mp4Url': 1.0,
        'webmUrl': 0.5,
        'gifUrl': 0.25,
    }

    @staticmethod
    def _get_uploaded_by(data: Dict[str, Any]) -> Any:
        """ Gets the uploader's username from API data.

        Args:
            data (dict[str,...]): The API data dictionary.

        Returns:
            str, optional: The username, or ``None`` for anonymous uploads.
        """
        username = data.get('userName')
        return username if username != 'anonymous' else None

    @staticmethod
    def _get_uploaded_date(data: Dict[str, Any]) -> datetime.datetime:
        """ Gets the upload datetime from API data.

        Args:
            data (dict[str,...]): The API data dictionary.

        Returns:
            datetime.datetime: The upload datetime (naive, local time).
        """
        # NOTE(review): assumes 'createDate' is always present in the API
        # response — int(None) would raise TypeError otherwise
        return datetime.datetime.fromtimestamp(int(data.get('createDate')))

    def _get_data(self, id: str) -> Dict[str, Any]:
        """ Gets API data for a specific gfycat id.

        Args:
            id (str): The id of the gfycat content to retrieve.

        Raises:
            exceptions.ExtractionError: When API call results in non 200 status

        Returns:
            dict[str,...]: API data dictionary response.
        """
        query_url = furl.furl(self._api_base).add(path=id)
        response = self.session.get(query_url.url)
        if response.status_code not in (200,):
            raise exceptions.ExtractionError((
                f"error retrieving source for {query_url.url!r}, "
                # fixed typo: "recieved" -> "received"
                f"received status {response.status_code}"
            ))
        return ujson.loads(response.text).get('gfyItem')

    def handle_raw(
        self, source: str, match: Match
    ) -> Generator[List[Content], None, None]:
        """ Handles ``raw`` links to gfycat media.

        Args:
            source (str): The source url
            match (Match): The source match regex

        Yields:
            list[Content]: A list of various levels of quality content for \
                the same source url
        """
        data = self._get_data(match.groupdict()['id'])
        # the raw link already points at a specific file; its extension is
        # taken from the url itself (computed once, used twice)
        extension = source.split('.')[-1]
        yield [Content(
            uid=f'{self.name}-{data["gfyId"]}-{extension}',
            source=source,
            fragments=[source],
            extractor=self,
            extension=extension,
            title=data.get('title'),
            description=data.get('description'),
            quality=1.0,
            uploaded_by=self._get_uploaded_by(data),
            uploaded_date=self._get_uploaded_date(data),
            metadata=data
        )]

    def handle_basic(
        self, source: str, match: Match
    ) -> Generator[List[Content], None, None]:
        """ Handles ``basic`` links to gfycat media.

        Args:
            source (str): The source url
            match (Match): The source match regex

        Yields:
            list[Content]: A list of various levels of quality content for \
                the same source url
        """
        data = self._get_data(match.groupdict()['id'])
        # build and yield one Content per media url the API reports
        content_list = []
        for url_type in self._content_urls:
            if url_type in data:
                content_list.append(Content(
                    uid=f'{self.name}-{data["gfyId"]}-{url_type}',
                    source=source,
                    fragments=[data.get(url_type)],
                    extractor=self,
                    extension=data.get(url_type).split('.')[-1],
                    title=data.get('title'),
                    description=data.get('description'),
                    quality=self._quality_map.get(url_type, 0.0),
                    uploaded_by=self._get_uploaded_by(data),
                    uploaded_date=self._get_uploaded_date(data),
                    metadata=data
                ))
        yield content_list