# Add x/html serialization to `ElementTree`
# Taken from ElementTree 1.3 preview with slight modifications
#
# Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved.
#
# fredrik@pythonware.com
# https://www.pythonware.com/
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2007 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------

"""
Python-Markdown provides two serializers which render [`ElementTree.Element`][xml.etree.ElementTree.Element]
objects to a string of HTML. Both functions wrap the same underlying code with only a few minor
differences as outlined below:

1. Empty (self-closing) tags are rendered as `<tag>` for HTML and as `<tag />` for XHTML.
2. Boolean attributes are rendered as `attrname` for HTML and as `attrname="attrname"` for XHTML.
"""

from __future__ import annotations

from xml.etree.ElementTree import ProcessingInstruction
from xml.etree.ElementTree import Comment, ElementTree, Element, QName, HTML_EMPTY
import re
from typing import Callable, Literal, NoReturn

__all__ = ['to_html_string', 'to_xhtml_string']

# Matches an ampersand that does NOT start a numeric (`&#38;`, `&#x26;`) or
# named (`&amp;`) character reference, so existing entities are left alone.
RE_AMP = re.compile(r'&(?!(?:\#[0-9]+|\#x[0-9a-f]+|[0-9a-z]+);)', re.I)


def _raise_serialization_error(text: str) -> NoReturn:  # pragma: no cover
    """Raise a `TypeError` reporting that `text` cannot be serialized."""
    raise TypeError(
        "cannot serialize {!r} (type {})".format(text, type(text).__name__)
    )


def _escape_cdata(text: str) -> str:
    """
    Escape character data for use as element content.

    Ampersands that are not already part of a character reference become
    `&amp;`; `<` and `>` become `&lt;` and `&gt;`.

    Raises:
        TypeError: If `text` is not string-like (for example `None`).
    """
    try:
        # Each replacement is guarded by a cheap membership test so that the
        # common case (nothing to escape) does no extra work.
        if "&" in text:
            # Only replace & when not part of an entity
            text = RE_AMP.sub('&amp;', text)
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        return text
    except (TypeError, AttributeError):  # pragma: no cover
        _raise_serialization_error(text)


def _escape_attrib(text: str) -> str:
    """
    Escape an attribute value, including embedded newlines.

    Performs the same replacements as `_escape_attrib_html` and additionally
    turns a newline into the `&#10;` character reference.

    Raises:
        TypeError: If `text` is not string-like.
    """
    try:
        if "&" in text:
            # Only replace & when not part of an entity
            text = RE_AMP.sub('&amp;', text)
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        if "\n" in text:
            text = text.replace("\n", "&#10;")
        return text
    except (TypeError, AttributeError):  # pragma: no cover
        _raise_serialization_error(text)


def _escape_attrib_html(text: str) -> str:
    """
    Escape an attribute value for a double-quoted HTML attribute.

    Bare ampersands, `<`, `>` and `"` are replaced with `&amp;`, `&lt;`,
    `&gt;` and `&quot;`; newlines are left untouched.

    Raises:
        TypeError: If `text` is not string-like.
    """
    try:
        if "&" in text:
            # Only replace & when not part of an entity
            text = RE_AMP.sub('&amp;', text)
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        return text
    except (TypeError, AttributeError):  # pragma: no cover
        _raise_serialization_error(text)


def _serialize_html(write: Callable[[str], None], elem: Element, format: Literal["html", "xhtml"]) -> None:
    """
    Recursively serialize `elem` and its subtree, passing each fragment to `write`.

    Arguments:
        write: Callable that receives each piece of output in document order.
        elem: The element to serialize.
        format: `"html"` or `"xhtml"`; controls how empty tags and boolean
            attributes are rendered.

    Raises:
        ValueError: If the tag is a `QName` whose text is not of the form `{uri}tag`.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    elif tag is None:
        # A tag-less element contributes only its text and children,
        # with no surrounding markup.
        if text:
            write(_escape_cdata(text))
        for e in elem:
            _serialize_html(write, e, format)
    else:
        namespace_uri = None
        if isinstance(tag, QName):
            # `QNAME` objects store their data as a string: `{uri}tag`
            if tag.text[:1] == "{":
                namespace_uri, tag = tag.text[1:].split("}", 1)
            else:
                raise ValueError('QName objects must define a tag.')
        write("<" + tag)
        for k, v in sorted(elem.items()):  # lexical order
            if isinstance(k, QName):
                # Assume a text only `QName`
                k = k.text
            if isinstance(v, QName):
                # Assume a text only `QName`
                v = v.text
            else:
                v = _escape_attrib_html(v)
            if k == v and format == 'html':
                # handle boolean attributes
                write(" %s" % v)
            else:
                write(' {}="{}"'.format(k, v))
        if namespace_uri:
            write(' xmlns="%s"' % (_escape_attrib(namespace_uri)))
        if format == "xhtml" and tag.lower() in HTML_EMPTY:
            write(" />")
        else:
            write(">")
            if text:
                if tag.lower() in ["script", "style"]:
                    # Raw text elements: content is emitted unescaped.
                    write(text)
                else:
                    write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, format)
            if tag.lower() not in HTML_EMPTY:
                write("</" + tag + ">")
    # The tail belongs after the element regardless of which branch ran.
    if elem.tail:
        write(_escape_cdata(elem.tail))


def _write_html(root: Element, format: Literal["html", "xhtml"] = "html") -> str:
    """Serialize `root` in the given `format` and return the joined output string."""
    assert root is not None
    data: list[str] = []
    write = data.append
    _serialize_html(write, root, format)
    return "".join(data)


# --------------------------------------------------------------------
# public functions


def to_html_string(element: Element) -> str:
    """ Serialize element and its children to a string of HTML5. """
    return _write_html(ElementTree(element).getroot(), format="html")


def to_xhtml_string(element: Element) -> str:
    """ Serialize element and its children to a string of XHTML. """
    return _write_html(ElementTree(element).getroot(), format="xhtml")