Source code for qetch.content

# Copyright (c) 2018 Stephen Bunn (stephen@bunn.io)
# MIT License <https://opensource.org/licenses/MIT>

import datetime
from typing import (Any, List, Dict,)

from .extractors._common import (BaseExtractor,)

import furl


[docs]class Content(object): """ The resulting content instance yielded by extractors. """ def __init__( self, uid: str, source: str, fragments: List[str], extractor: BaseExtractor, extension: str=None, title: str=None, description: str=None, quality: float=0.0, uploaded_by: str=None, uploaded_date: datetime.datetime=None, metadata: Dict[str, Any]={} ): """ Initializes the Content instance. uid (str): The unique id for the discovered content. source (str): The source url given to the extractor. fragments (list[str]): A list of urls which represent the raw content. extractor (BaseExtractor): The extractor which discovered the content. extension (str): The extension for the resulting file. title (str, optional): A title for the content. description (str, optional): A description for the content. quality (float, optional): A level of quality for the content in relation to other content with the same source. uploaded_by (str, optional): A string of the uploader's name. uploaded_date (datetime.datetime, optional): A datetime instance for when the content was uploaded. metadata (dict[str,....], optional): Any additional metadata about the discovered content. """ ( self.uid, self.source, self.fragments, self.extractor, self.extension, self.title, self.description, self.quality, self.uploaded_by, self.uploaded_date, self.metadata, ) = ( uid, source, fragments, extractor, extension, title, description, quality, uploaded_by, uploaded_date, metadata ) def __repr__(self): """ Returns a string representation of the content. Returns: str: The string representation of the instance. """ return ( f'<{self.__class__.__name__} ({self.quality}) ' f'"{self.uid}.{self.extension}">' ) @property def uid(self) -> str: """ The unique id of the discovered content. Returns: str: The unique id of the discovered content. """ if not hasattr(self, '_uid'): self._uid = None return self._uid @uid.setter def uid(self, uid: str): if isinstance(uid, str) and len(uid) > 0: self._uid = uid @property def source(self) -> furl.furl: """ The given source url from where the content came from. Returns: furl.furl: The given source url from where the content came from. """ if not hasattr(self, '_source'): self._source = None return self._source @source.setter def source(self, url: str): if isinstance(url, str) and len(url) > 0: self._source = furl.furl(url=url) @property def fragments(self) -> List[str]: """ A list of urls which represent the raw content. Returns: list[str]: A list of urls which represent the raw content. """ if not hasattr(self, '_fragments'): self._fragments = [] return self._fragments @fragments.setter def fragments(self, fragments: List[str]): if isinstance(fragments, list) and len(fragments) > 0 and \ all(isinstance(fragment, str) for fragment in fragments): self._fragments = fragments @property def extractor(self) -> BaseExtractor: """ The extractor which discovered the content. Returns: BaseExtractor: The extractor which discovered the content. """ if not hasattr(self, '_extractor'): self._extractor = None return self._extractor @extractor.setter def extractor(self, extractor: BaseExtractor): if isinstance(extractor, BaseExtractor): self._extractor = extractor @property def extension(self) -> str: """ The extension of the resulting content. Returns: str: The extension for the resulting content. """ if not hasattr(self, '_extension'): self._extension = None return self._extension @extension.setter def extension(self, extension: str): if isinstance(extension, str) and len(extension) > 0: self._extension = extension @property def title(self) -> str: """ The title of the content. Returns: str: The title of the content. """ if not hasattr(self, '_title'): self._title = None return self._title @title.setter def title(self, title: str): if isinstance(title, str) and len(title) > 0: self._title = title @property def description(self) -> str: """ The description of the content. Returns: str: The description of the content. """ if not hasattr(self, '_description'): self._description = None return self._description @description.setter def description(self, description: str): if isinstance(description, str) and len(description) > 0: self._description = description @property def quality(self) -> float: """ The contextual quality for the current content. Returns: float: The contextual quality for the current content. """ if not hasattr(self, '_quality'): self._quality = 0.0 return self._quality @quality.setter def quality(self, quality: float): if isinstance(quality, float) and 0.0 <= quality <= 1.0: self._quality = quality @property def uploaded_by(self) -> str: """ A string of the uploader's name. Returns: str: A string of the uploader's name. """ if not hasattr(self, '_uploaded_by'): self._uploaded_by = None return self._uploaded_by @uploaded_by.setter def uploaded_by(self, uploaded_by: str): if isinstance(uploaded_by, str) and len(uploaded_by) > 0: self._uploaded_by = uploaded_by @property def uploaded_date(self) -> datetime.datetime: """ The datetime the content was uploaded. Returns: datetime.datetime: The datetime the content was uploaded. """ if not hasattr(self, '_uploaded_date'): self._uploaded_date = None return self._uploaded_date @uploaded_date.setter def uploaded_date(self, uploaded_date: datetime.datetime): if isinstance(uploaded_date, datetime.datetime): self._uploaded_date = uploaded_date @property def metadata(self) -> Dict[str, Any]: """ Any metadata for the current content. Returns: dict[str,....]: Any metadata for the current content. """ if not hasattr(self, '_metadata'): self._metadata = {} return self._metadata @metadata.setter def metadata(self, metadata: Dict[str, Any]): if isinstance(metadata, dict): self._metadata = metadata
[docs] def get_size(self) -> int: """ Returns the sum of the length of the fragments. Returns: int: The sum of the length of the fragments. """ return sum( int(self.extractor.session.head( fragment ).headers['Content-Length']) for fragment in self.fragments
)