# Source code for qetch.extractors.fourchan
# Copyright (c) 2018 Stephen Bunn (stephen@bunn.io)
# MIT License <https://opensource.org/licenses/MIT>
import datetime
from typing import (Any, Tuple, List, Dict, Match, Generator,)
from .. import (exceptions,)
from ..auth import (AuthTypes,)
from ..content import (Content,)
from ._common import (BaseExtractor,)
import bs4
import furl
import ujson
class FourChanExtractor(BaseExtractor):
    """ The extractor for links to media from ``4chan.org``.

    Two kinds of links are handled (see ``handles``):

    * ``thread`` links, which are resolved by requesting the thread's JSON
      document below ``_api_base`` and yielding one content list per post
      that carries an ``md5`` entry.
    * ``raw`` links, which already point at a media file below
      ``i.4cdn.org`` and are wrapped without performing any request.
    """

    name = '4chan'
    description = ('A no-limits and lightly categorized temporary image host.')
    authentication = AuthTypes.NONE
    # ``i.4cdn.org`` is the host matched by the ``raw`` pattern and used by
    # ``_img_base``; the pre-existing entries are kept untouched.
    domains = ['4chan.org', 'i.4chan.org', 'i.4cdn.org']
    handles = {
        # The groups stop at path separators so that a trailing slash, a
        # thread slug or a ``#p123`` anchor never leaks into ``id``.
        'thread': (
            r'^https?://(?:www\.)?(?:boards\.)?4chan\.org/(?P<board>[^/]+)/'
            r'thread/(?P<id>[^/?#]+)(?:[/?#].*)?$'
        ),
        'raw': (
            r'^https?://(?:www\.)?i\.4cdn\.org/(?P<board>[^/]+)/'
            r'(?P<id>[^/]+)\.(?P<ext>[a-zA-Z0-9]+)$'
        )
    }

    _api_base = 'https://a.4cdn.org/'
    _img_base = 'https://i.4cdn.org/'
    # Each entry is ``(post_type, url_path, quality, extension_type)``:
    #   post_type      -- uid suffix, ``None`` meaning "no suffix"
    #   url_path       -- ``str.format`` template fed ``board`` and ``post``
    #   quality        -- quality value handed to ``Content``
    #   extension_type -- fixed extension, ``None`` meaning "use the post's"
    _content_configs = [
        (
            None,
            '{board}/{post[tim]}{post[ext]}',
            1.0,
            None,
        ), (
            'thumb',
            '{board}/{post[tim]}s.jpg',
            0.0,
            'jpg',
        ),
    ]

    def _get_data(self, board: str, id: str) -> Dict[str, Any]:
        """ Gets API data for a specific 4chan board and thread id.

        Args:
            board (str): The id of the passed board
            id (str): The id of the passed thread

        Raises:
            exceptions.ExtractionError: When API call results in non 200 status

        Returns:
            dict[str,....]: API data dictionary response
        """

        query_url = furl.furl(self._api_base).add(
            path=f'{board}/thread/{id}.json'
        )
        response = self.session.get(query_url.url)
        if response.status_code != 200:
            raise exceptions.ExtractionError((
                f"error retrieving source for {query_url.url!r} "
                f"recieved status {response.status_code}"
            ))
        return ujson.loads(response.text)

    def handle_thread(
        self, source: str, match: Match
    ) -> Generator[List[Content], None, None]:
        """ Handles ``thread`` links to 4chan media.

        Args:
            source (str): The source url
            match (Match): The source match regex

        Raises:
            exceptions.ExtractionError: When the thread's API data cannot be
                retrieved

        Yields:
            list[Content]: A list of various levels of quality content for \
                the same source url
        """

        matchdict = match.groupdict()
        board, thread_id = matchdict['board'], matchdict['id']
        data = self._get_data(board, thread_id)

        for post in data.get('posts', []):
            # posts without an ``md5`` entry are skipped (presumably these
            # are posts that have no attached file)
            if 'md5' not in post:
                continue

            # identical for every quality level of the post, so computed once
            description = bs4.BeautifulSoup(post.get('com', ''), 'lxml').text
            posted_at = post.get('time')
            uploaded_date = (
                datetime.datetime.fromtimestamp(int(posted_at))
                if posted_at is not None else
                None
            )

            content_list = []
            for (post_type, url_path, quality, extension_type,) in \
                    self._content_configs:
                # a missing post_type contributes an empty uid suffix
                post_type = post_type or ''
                fragment_url = furl.furl(self._img_base).add(
                    path=url_path.format(board=board, post=post)
                ).url
                content_list.append(Content(
                    uid=(
                        f'{self.name}-{board}-'
                        f'{thread_id}-{post["tim"]}-{post_type}'
                    ),
                    source=source,
                    fragments=[fragment_url],
                    extractor=self,
                    # fall back to the post's own extension, minus the dot
                    extension=(
                        extension_type
                        if extension_type else
                        post['ext'].split('.')[-1]
                    ),
                    title=post.get('filename'),
                    description=description,
                    quality=quality,
                    uploaded_by=post.get('name'),
                    uploaded_date=uploaded_date,
                    metadata=post
                ))
            yield content_list

    def handle_raw(
        self, source: str, match: Match
    ) -> Generator[List[Content], None, None]:
        """ Handles ``raw`` links to 4chan media.

        No request is made; every configured quality level wraps the given
        source url itself as its only fragment.

        Args:
            source (str): The source url
            match (Match): The source match regex

        Yields:
            list[Content]: A list of various levels of quality content for \
                the same source url
        """

        matchdict = match.groupdict()
        board, media_id = matchdict['board'], matchdict['id']
        # every fragment is the source url, so its extension applies to all
        # entries; fall back to the url when the match has no ``ext`` group
        extension = matchdict.get('ext') or source.rsplit('.', 1)[-1]

        content_list = []
        # the url template and fixed extension of a config are irrelevant
        # here because the source already is the final media url
        for (post_type, _url_path, quality, _extension_type,) in \
                self._content_configs:
            # a missing post_type contributes an empty uid suffix, matching
            # the uid layout used for thread content
            post_type = post_type or ''
            content_list.append(Content(
                uid=f'{self.name}-{board}-raw-{media_id}-{post_type}',
                source=source,
                fragments=[source],
                extractor=self,
                extension=extension,
                title=None,
                description=None,
                quality=quality,
                uploaded_by=None,
                uploaded_date=None,
                metadata=None
            ))
        yield content_list