# Source code for qetch.extractors.imgur

# Copyright (c) 2018 Stephen Bunn (stephen@bunn.io)
# MIT License <https://opensource.org/licenses/MIT>

import datetime
from typing import (Any, Tuple, List, Dict, Match, Generator,)

from .. import (exceptions,)
from ..auth import (AuthTypes,)
from ..content import (Content,)
from ._common import (BaseExtractor,)

import furl
import ujson


class ImgurExtractor(BaseExtractor):
    """ The extractor for links to media from ``imgur.com``.
    """

    name = 'imgur'
    description = ('Dedicated image host originally built for Reddit.')
    authentication = AuthTypes.OAUTH
    domains = ['imgur.com', 'i.imgur.com']
    handles = {
        'basic': (
            r'^https?://(?:www\.)?imgur\.com/(?P<id>[a-zA-Z0-9]+)/?$'
        ),
        'album': (
            r'^https?://(?:www\.)?imgur\.com/'
            r'(?:a|gallery)/(?P<id>[a-zA-Z0-9]+)/?$'
        ),
        'raw': (
            r'^https?://(?:www\.)?(?:[a-z]\.)imgur\.com/'
            r'(?P<id>[a-zA-Z0-9]+)\..*$'
        )
    }

    # Base endpoint for version 3 of the Imgur API.
    _api_base = 'https://api.imgur.com/3'
    # API response keys that may contain downloadable urls, best quality first.
    _content_urls = (
        'mp4',
        'gifv',
        'link',
    )
    # Relative quality ranking for each of the url keys above.
    _quality_map = {
        'mp4': 1.0,
        'gifv': 0.5,
        'link': 0.0,
    }

    def _get_data(
        self,
        id: str,
        is_album: bool=False,
        is_raw: bool=False
    ) -> Dict[str, Any]:
        """ Gets API data for a specific imgur id.

        Args:
            id (str): The id of the imgur content to retrieve.
            is_album (bool, optional): If True, indicates that id is that
                of an album.
            is_raw (bool, optional): If True, indicates that id is that of
                some raw imgur link.

        Raises:
            exceptions.ExtractionError: When API call results in non 200
                status

        Returns:
            dict[str,....]: API data dictionary response
        """
        query_url = furl.furl(self._api_base).add(
            path=(
                # BUG FIX: previously '"/gallery"', which produced the
                # invalid path segment 'galleryimage' for raw links; raw
                # ids must resolve to 'gallery/image/{id}'.
                f'{"gallery/" if is_raw else ""}'
                f'{"album" if is_album else "image"}/{id}'
            )
        )
        response = self.session.get(query_url.url)
        if response.status_code != 200:
            # fixed typo in error message ("recieved" -> "received")
            raise exceptions.ExtractionError((
                f"error retrieving source for {query_url.url!r} "
                f"received status {response.status_code}"
            ))
        # API payloads wrap the useful content under the 'data' key
        return ujson.loads(response.text).get('data')
[docs] def handle_basic( self, source: str, match: Match ) -> Generator[List[Content], None, None]: """ Handles ``basic`` links to imgur media. Args: source (str): The source url match (Match): The source match regex Yields: list[Content]: A list of various levels of quality content for \ the same source url """ data = self._get_data(match.groupdict()['id']) content_list = [] for url_type in self._content_urls: if url_type in data: content_list.append(Content( uid=f'{self.name}-{data["id"]}-{url_type}', source=source, fragments=[data[url_type]], extractor=self, extension=data[url_type].split('.')[-1], title=data.get('title'), description=data.get('description'), quality=self._quality_map.get(url_type, 0.0), uploaded_by=data.get('account_id'), uploaded_date=datetime.datetime.fromtimestamp( int(data.get('datetime')) ), metadata=data ))
yield content_list
[docs] def handle_album( self, source: str, match: Match ) -> Generator[List[Content], None, None]: """ Handles ``album`` links to imgur media. Args: source (str): The source url match (Match): The source match regex Yields: list[Content]: A list of various levels of quality content for \ the same source url """ try: data = self._get_data(match.groupdict()['id'], is_album=True) for image in data.get('images', []): content_list = [] for url_type in self._content_urls: if url_type in image: content_list.append(Content( uid=f'{self.name}-{image["id"]}-{url_type}', source=source, fragments=[image[url_type]], extractor=self, extension=image[url_type].split('.')[-1], title=data.get('title'), description=data.get('description'), quality=self._quality_map.get(url_type, 0.0), uploaded_by=data.get('account_id'), uploaded_date=datetime.datetime.fromtimestamp( int(data.get('datetime')) ), metadata=image )) yield content_list except exceptions.ExtractionError as exc: for content_list in self.handle_basic(source, match):
yield content_list
[docs] def handle_raw( self, source: str, match: Match ) -> Generator[List[Content], None, None]: """ Handles ``raw`` links to imgur media. Args: source (str): The source url match (Match): The source match regex Yields: list[Content]: A list of various levels of quality content for \ the same source url """ data = self._get_data(match.groupdict()['id'], is_raw=True) content_list = [] for url_type in self._content_urls: if url_type in data: content_list.append(Content( uid=f'{self.name}-{data["id"]}-{url_type}', source=source, fragments=[data[url_type]], extractor=self, extension=data[url_type].split('.')[-1], title=data.get('title'), description=data.get('description'), quality=self._quality_map.get(url_type, 0.0), uploaded_by=data.get('account_id'), uploaded_date=datetime.datetime.fromtimestamp( int(data.get('timestamp')) ), meatdata=data ))
yield content_list
[docs] def authenticate(self, auth: Tuple[str, str]): """ Handles authenticating the extractor if necessary. Args: auth (tuple[str, str]): The authentication tuple is available. """ self.session.headers.update({ 'authorization': f'Client-ID {auth[0]}'
})