Source code for qetch.extractors.gfycat

# Copyright (c) 2018 Stephen Bunn (stephen@bunn.io)
# MIT License <https://opensource.org/licenses/MIT>

import datetime
from typing import (Any, List, Dict, Match, Generator,)

import furl
import ujson

from .. import (exceptions,)
from ..auth import (AuthTypes,)
from ..content import (Content,)
from ._common import (BaseExtractor,)


class GfycatExtractor(BaseExtractor):
    """ The extractor for links to media from ``gfycat.com``.
    """

    name = 'gfycat'
    description = ('Site which hosts short high-quality video for sharing.')
    authentication = AuthTypes.NONE
    domains = ['gfycat.com']
    handles = {
        'basic': (
            r'^https?://(?:www\.)?gfycat\.com/'
            r'(?:gifs/detail/)?(?P<id>[a-zA-Z]+)/?$'
        ),
        'raw': (
            r'^https?://(?:[a-z]+\.)gfycat\.com/'
            r'(?P<id>[a-zA-Z]+)(?:\.[a-zA-Z0-9]+)$'
        )
    }

    _api_base = 'http://gfycat.com/cajax/get/'
    _content_urls = (
        'mp4Url', 'webmUrl', 'webpUrl', 'mobileUrl', 'mobilePosterUrl',
        'posterUrl', 'thumb360Url', 'thumb360PosterUrl', 'thumb100PosterUrl',
        'max5mbGif', 'max2mbGif', 'mjpgUrl', 'miniUrl', 'miniPosterUrl',
        'gifUrl',
    )
    _quality_map = {
        'mp4Url': 1.0,
        'webmUrl': 0.5,
        'gifUrl': 0.25,
    }

    def _get_data(self, id: str) -> Dict[str, Any]:
        """ Gets API data for a specific gfycat id.

        Args:
            id (str): The id of the gfycat content to retrieve.

        Raises:
            exceptions.ExtractionError: When the API call results in a
                non-200 status.

        Returns:
            dict[str,...]: API data dictionary response.
        """

        query_url = furl.furl(self._api_base).add(path=id)
        response = self.session.get(query_url.url)
        if response.status_code not in (200,):
            raise exceptions.ExtractionError((
                f"error retrieving source for {query_url.url!r}, "
                f"received status {response.status_code}"
            ))
        return ujson.loads(response.text).get('gfyItem')
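
    # NOTE: a hypothetical sketch (not part of the original source) of the
    # 'gfyItem' payload that ``_get_data`` returns. The field names are the
    # ones this module actually reads; the values are illustrative only:
    #
    #   {
    #       "gfyId": "examplename",
    #       "title": "Example title",
    #       "description": "",
    #       "userName": "anonymous",
    #       "createDate": "1514764800",
    #       "mp4Url": "https://giant.gfycat.com/ExampleName.mp4",
    #       "webmUrl": "https://giant.gfycat.com/ExampleName.webm",
    #       "gifUrl": "https://giant.gfycat.com/ExampleName.gif",
    #       ...
    #   }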

    def handle_raw(
        self, source: str, match: Match
    ) -> Generator[List[Content], None, None]:
        """ Handles ``raw`` links to gfycat media.

        Args:
            source (str): The source url
            match (Match): The regex match for the source url

        Yields:
            list[Content]: A list of various levels of quality content for \
                the same source url
        """

        data = self._get_data(match.groupdict()['id'])
        yield [Content(
            uid=f'{self.name}-{data["gfyId"]}-{source.split(".")[-1]}',
            source=source,
            fragments=[source],
            extractor=self,
            extension=source.split('.')[-1],
            title=data.get('title'),
            description=data.get('description'),
            quality=1.0,
            uploaded_by=(
                data.get('userName')
                if data.get('userName') != 'anonymous' else None
            ),
            uploaded_date=datetime.datetime.fromtimestamp(
                int(data.get('createDate'))
            ),
            metadata=data
        )]

    def handle_basic(
        self, source: str, match: Match
    ) -> Generator[List[Content], None, None]:
        """ Handles ``basic`` links to gfycat media.

        Args:
            source (str): The source url
            match (Match): The regex match for the source url

        Yields:
            list[Content]: A list of various levels of quality content for \
                the same source url
        """

        data = self._get_data(match.groupdict()['id'])
        # build and yield content list
        content_list = []
        for url_type in self._content_urls:
            if url_type in data:
                content_list.append(Content(
                    uid=f'{self.name}-{data["gfyId"]}-{url_type}',
                    source=source,
                    fragments=[data.get(url_type)],
                    extractor=self,
                    extension=data.get(url_type).split('.')[-1],
                    title=data.get('title'),
                    description=data.get('description'),
                    quality=self._quality_map.get(url_type, 0.0),
                    uploaded_by=(
                        data.get('userName')
                        if data.get('userName') != 'anonymous' else None
                    ),
                    uploaded_date=datetime.datetime.fromtimestamp(
                        int(data.get('createDate'))
                    ),
                    metadata=data
                ))
        yield content_list
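

# ---------------------------------------------------------------------------
# A minimal usage sketch, not part of the original module. It assumes that
# GfycatExtractor can be constructed without arguments and that the inherited
# ``session`` attribute (used by ``_get_data``) performs the HTTP requests;
# both come from BaseExtractor and may differ in the real framework.
if __name__ == '__main__':
    import re

    extractor = GfycatExtractor()
    source = 'https://gfycat.com/ExampleName'
    # match the url against the 'basic' handle pattern defined on the class
    match = re.match(extractor.handles['basic'], source)
    if match is not None:
        for content_list in extractor.handle_basic(source, match):
            # pick the highest-quality variant (mp4Url maps to 1.0)
            best = max(content_list, key=lambda content: content.quality)
            print(best.uid, best.quality, best.fragments)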