# Source code for qetch.extractors.fourchan

# Copyright (c) 2018 Stephen Bunn (stephen@bunn.io)
# MIT License <https://opensource.org/licenses/MIT>

import datetime
from typing import (Any, Tuple, List, Dict, Match, Generator,)

from .. import (exceptions,)
from ..auth import (AuthTypes,)
from ..content import (Content,)
from ._common import (BaseExtractor,)

import bs4
import furl
import ujson


class FourChanExtractor(BaseExtractor):
    """ The extractor for links to media from ``4chan.org``.

    Two kinds of links are handled: ``thread`` links, whose posts are
    looked up through the JSON API at ``_api_base``, and ``raw`` links,
    which already point directly at a media file.
    """

    name = '4chan'
    description = ('A no-limits and lightly categorized temporary image host.')
    authentication = AuthTypes.NONE
    # NOTE(review): the ``raw`` handle below matches ``i.4cdn.org`` while this
    # list names ``i.4chan.org`` -- confirm against how ``domains`` is used
    domains = ['4chan.org', 'i.4chan.org']
    handles = {
        # ``board`` and ``id`` are limited to a single path segment so that a
        # trailing slash, slug, query or fragment never leaks into the thread
        # id that is later used to build the API url
        'thread': (
            r'^https?://(?:www\.)?(?:boards\.)?4chan\.org/(?P<board>[^/]+)/'
            r'thread/(?P<id>\d+)(?:[/?#].*)?$'
        ),
        'raw': (
            r'^https?://(?:www\.)?i\.4cdn\.org/(?P<board>.*)/'
            r'(?P<id>.*)\.(?:[a-zA-Z0-9]+)$'
        )
    }

    _api_base = 'https://a.4cdn.org/'
    _img_base = 'https://i.4cdn.org/'
    # each entry is a 4-tuple of:
    #   (post type, url path template, quality, forced extension)
    # a post type of ``None`` is rendered as an empty string in content uids
    # and a forced extension of ``None`` means "use the extension of the post"
    _content_configs = [
        (
            None,
            '{board}/{post[tim]}{post[ext]}',
            1.0,
            None,
        ), (
            'thumb',
            '{board}/{post[tim]}s.jpg',
            0.0,
            'jpg',
        ),
    ]

    def _get_data(self, board: str, id: str) -> Dict[str, Any]:
        """ Gets API data for a specific 4chan board and thread id.

        Args:
            board (str): The id of the passed board
            id (str): The id of the passed thread

        Raises:
            exceptions.ExtractionError: When API call results in non 200 status

        Returns:
            dict[str,....]: API data dictionary response
        """

        query_url = furl.furl(self._api_base).add(
            path=f'{board}/thread/{id}.json'
        )
        response = self.session.get(query_url.url)
        if response.status_code != 200:
            raise exceptions.ExtractionError((
                f"error retrieving source for {query_url.url!r} "
                f"recieved status {response.status_code}"
            ))
        return ujson.loads(response.text)

    def handle_thread(
        self, source: str, match: Match
    ) -> Generator[List[Content], None, None]:
        """ Handles ``thread`` links to 4chan media.

        One list is yielded for every post of the thread that has an ``md5``
        key, holding one content entry per ``_content_configs`` entry.

        Args:
            source (str): The source url
            match (Match): The source match regex

        Raises:
            exceptions.ExtractionError: When the thread data cannot be
                retrieved (see ``_get_data``)

        Yields:
            list[Content]: A list of various levels of quality content for \
                the same source url
        """

        matchdict = match.groupdict()
        board = matchdict['board']
        thread_id = matchdict['id']
        data = self._get_data(board, thread_id)
        for post in data.get('posts', []):
            # posts without an ``md5`` key are skipped
            if 'md5' not in post:
                continue

            # these values are identical for every quality level of a post,
            # so they are built once instead of once per content config
            description = bs4.BeautifulSoup(
                post.get('com', ''),
                'lxml'
            ).text
            uploaded_date = datetime.datetime.fromtimestamp(
                int(post.get('time'))
            )

            content_list = []
            for (post_type, url_path, quality, extension_type,) in \
                    self._content_configs:
                # build post_type depending on existing post_type
                post_type = (post_type if post_type else '')
                content_uid = (
                    f'{self.name}-{board}-'
                    f'{thread_id}-{post["tim"]}-{post_type}'
                )
                content_fragments = [
                    furl.furl(self._img_base).add(path=url_path.format(
                        board=board,
                        post=post
                    )).url
                ]
                # fall back to the extension of the post (without the dot)
                # when the config does not force one
                content_extension = (
                    extension_type
                    if extension_type else
                    post['ext'].split('.')[-1]
                )

                content_list.append(Content(
                    uid=content_uid,
                    source=source,
                    fragments=content_fragments,
                    extractor=self,
                    extension=content_extension,
                    title=post.get('filename'),
                    description=description,
                    quality=quality,
                    uploaded_by=post.get('name'),
                    uploaded_date=uploaded_date,
                    metadata=post
                ))
            yield content_list

    def handle_raw(
        self, source: str, match: Match
    ) -> Generator[List[Content], None, None]:
        """ Handles ``raw`` links to 4chan media.

        No API request is made. Every yielded content entry uses the source
        url itself as its only fragment; the entries differ only in their
        uid and quality, which come from ``_content_configs``.

        Args:
            source (str): The source url
            match (Match): The source match regex

        Yields:
            list[Content]: A list of various levels of quality content for \
                the same source url
        """

        matchdict = match.groupdict()
        # the ``raw`` pattern only matches urls ending in ``.<extension>``
        extension = source.rsplit('.', 1)[-1]
        content_list = []
        # configs are 4-tuples; the url template and forced extension do not
        # apply here because the fragment is always the source url itself
        for (post_type, _url_path, quality, _extension_type,) in \
                self._content_configs:
            # build post_type depending on existing post_type
            post_type = (post_type if post_type else '')
            content_list.append(Content(
                uid=(
                    f'{self.name}-{matchdict["board"]}-raw-{matchdict["id"]}'
                    f'{post_type}'
                ),
                source=source,
                fragments=[source],
                extractor=self,
                extension=extension,
                title=None,
                description=None,
                quality=quality,
                uploaded_by=None,
                uploaded_date=None,
                metadata=None
            ))
        yield content_list
# Source code for qetch.extractors.fourchan

# Table Of Contents

# About

# Useful Links

# Author's Projects

# Related Topics