Source code for nefelibata.assistants.warn_external_resources

import logging
import urllib.parse
from pathlib import Path

import tinycss2
from bs4 import BeautifulSoup
from nefelibata.assistants import Assistant
from nefelibata.assistants import Scope
from nefelibata.post import Post

_logger = logging.getLogger(__name__)


class WarnExternalResourcesAssistant(Assistant):
    """Log a warning for every external resource referenced by generated HTML.

    The assistant inspects ``<img src>``, ``<link href>`` and ``<script src>``
    attributes of an HTML file and, for local stylesheets referenced from
    those tags, the URLs used inside the CSS. A resource is reported when it
    is an absolute URL (contains ``://``) that is not considered safe by
    :meth:`_safe`.
    """

    scopes = [Scope.POST, Scope.SITE]

    def process_post(self, post: Post, force: bool = False) -> None:
        """Check the HTML file that sits next to the post's source file.

        ``force`` is accepted for interface compatibility and is not used.
        """
        self._process_file(post.file_path.with_suffix(".html"))

    def process_site(self, force: bool = False) -> None:
        """Check every top-level ``*.html`` file in the ``build`` directory.

        ``force`` is accepted for interface compatibility and is not used.
        """
        for path in (self.root / "build").glob("*.html"):
            self._process_file(path)

    def _safe(self, resource: str) -> bool:
        """Return ``True`` when ``resource`` is an acceptable URL."""
        # any URL that starts with the blog URL is safe
        if resource.startswith(self.config["url"]):
            return True

        # if the blog uses an external endpoint for webmention that's ok
        if (
            "webmention" in self.config
            and resource == self.config["webmention"]["endpoint"]
        ):
            return True

        return False

    def _process_file(self, file_path: Path) -> None:
        """Warn about external resources referenced by one HTML file."""
        with open(file_path) as fp:
            html = fp.read()

        # (tag name, attribute holding the resource URL)
        tag_attributes = [
            ("img", "src"),
            ("link", "href"),
            ("script", "src"),
        ]

        soup = BeautifulSoup(html, "html.parser")
        for tag, attr in tag_attributes:
            for el in soup.find_all(tag):
                resource = el.attrs.get(attr)
                if not resource:
                    continue

                if "://" in resource and not self._safe(resource):
                    _logger.warning(f"External resource found: {resource}")

                if resource.endswith(".css"):
                    self._check_css(resource, file_path)

    def _check_css(self, resource: str, file_path: Path) -> None:
        """Check CSS for external URLs.

        ``resource`` is the stylesheet reference exactly as it appears in the
        HTML file at ``file_path``. Stylesheets that cannot be found on disk
        are silently skipped.
        """
        if "://" in resource and not resource.startswith(self.config["url"]):
            # external CSS, should've been already flagged
            return

        # this should be "posts", when `process_post` is called, or "build",
        # when `process_site` is called
        base_dir = file_path.relative_to(self.root).parts[0]

        if resource.startswith("/"):
            css_path = self.root / base_dir / resource[1:]
        else:
            css_path = file_path.parent / resource

        if not css_path.exists():
            return

        with open(css_path) as fp:
            css = fp.read()

        stylesheet = tinycss2.parse_stylesheet(
            css,
            skip_comments=True,
            skip_whitespace=True,
        )
        for url in self._iter_css_urls(stylesheet):
            if "://" in url and not self._safe(url):
                _logger.warning(
                    f"External resource found in {css_path}: {url}",
                )

    def _iter_css_urls(self, nodes):
        """Yield the value of every ``url(...)`` found in a tinycss2 node tree.

        ``nodes`` is an iterable of tinycss2 AST nodes, or ``None``. The walk
        descends into rule preludes, rule/block contents and function
        arguments, so URLs in ``@import url(...)``, inside ``@media`` blocks
        or nested in functions are found as well.
        """
        for node in nodes or ():
            if isinstance(node, tinycss2.ast.URLToken):
                # unquoted form: url(https://example.com/x.png)
                yield node.value
            elif isinstance(node, tinycss2.ast.FunctionBlock):
                if node.lower_name == "url":
                    # quoted form: url("...") is parsed as a function whose
                    # argument is a string token
                    for argument in node.arguments:
                        if isinstance(argument, tinycss2.ast.StringToken):
                            yield argument.value
                else:
                    yield from self._iter_css_urls(node.arguments)
            else:
                # Rules carry `prelude` and `content`; bracketed blocks carry
                # `content`. At-rules ending in ";" have `content` set to
                # None and parse errors have neither attribute, hence the
                # defensive lookups.
                yield from self._iter_css_urls(getattr(node, "prelude", None))
                yield from self._iter_css_urls(getattr(node, "content", None))