You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
477 lines
17 KiB
Python
477 lines
17 KiB
Python
2 months ago
|
# Python Markdown
|
||
|
|
||
|
# A Python implementation of John Gruber's Markdown.
|
||
|
|
||
|
# Documentation: https://python-markdown.github.io/
|
||
|
# GitHub: https://github.com/Python-Markdown/markdown/
|
||
|
# PyPI: https://pypi.org/project/Markdown/
|
||
|
|
||
|
# Started by Manfred Stienstra (http://www.dwerg.net/).
|
||
|
# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
|
||
|
# Currently maintained by Waylan Limberg (https://github.com/waylan),
|
||
|
# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
|
||
|
|
||
|
# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later)
|
||
|
# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
|
||
|
# Copyright 2004 Manfred Stienstra (the original version)
|
||
|
|
||
|
# License: BSD (see LICENSE.md for details).
|
||
|
|
||
|
"""
|
||
|
Tree processors manipulate the tree created by block processors. They can even create an entirely
|
||
|
new `ElementTree` object. This is an excellent place for creating summaries, adding collected
|
||
|
references, or last minute adjustments.
|
||
|
|
||
|
"""
|
||
|
|
||
|
from __future__ import annotations
|
||
|
|
||
|
import re
|
||
|
import xml.etree.ElementTree as etree
|
||
|
from typing import TYPE_CHECKING, Any
|
||
|
from . import util
|
||
|
from . import inlinepatterns
|
||
|
|
||
|
if TYPE_CHECKING: # pragma: no cover
|
||
|
from markdown import Markdown
|
||
|
|
||
|
|
||
|
def build_treeprocessors(md: Markdown, **kwargs: Any) -> util.Registry[Treeprocessor]:
    """ Create the registry holding the default `treeprocessors` for Markdown.

    Arguments:
        md: The `Markdown` instance each processor is constructed with.
        **kwargs: Accepted but not used by this function.

    Returns:
        A `util.Registry` with the `inline`, `prettify` and `unescape`
        processors registered at priorities 20, 10 and 0 respectively.

    """
    # (processor class, registered name, priority), in registration order.
    defaults = (
        (InlineProcessor, 'inline', 20),
        (PrettifyTreeprocessor, 'prettify', 10),
        (UnescapeTreeprocessor, 'unescape', 0),
    )
    registry = util.Registry()
    for processor_class, name, priority in defaults:
        registry.register(processor_class(md), name, priority)
    return registry
|
||
|
|
||
|
|
||
|
def isString(s: object) -> bool:
    """ Tell whether an object is a plain string.

    Returns `True` for instances of `str` that are not instances of
    [`AtomicString`][markdown.util.AtomicString], and `False` for everything else.

    """
    return isinstance(s, str) and not isinstance(s, util.AtomicString)
|
||
|
|
||
|
|
||
|
class Treeprocessor(util.Processor):
    """
    `Treeprocessor`s are run on the `ElementTree` object before serialization.

    Each `Treeprocessor` implements a `run` method that takes a pointer to an
    `Element` and modifies it as necessary.

    `Treeprocessors` must extend `markdown.Treeprocessor`.

    This base class only defines the interface: its own `run` does nothing.

    """
    def run(self, root: etree.Element) -> etree.Element | None:
        """
        Subclasses of `Treeprocessor` should implement a `run` method, which
        takes a root `Element`. This method can return another `Element`
        object, and the existing root `Element` will be replaced, or it can
        modify the current tree and return `None`.

        Arguments:
            root: The root `Element` of the tree to process.

        Returns:
            A replacement root `Element`, or `None` when the tree was modified
            in place. The base implementation has an empty body and therefore
            returns `None`.

        """
        pass  # pragma: no cover
|
||
|
|
||
|
|
||
|
class InlineProcessor(Treeprocessor):
    """
    A `Treeprocessor` that traverses a tree, applying inline patterns.

    Matched inline content is replaced in the text by placeholder strings
    while the generated nodes are kept in `self.stashed_nodes`; the
    placeholders are later resolved back into `Element`s (or plain strings)
    by `__processPlaceholders`.

    Note that `self.stashed_nodes` and `self.parent_map` are created by `run`,
    not by `__init__`.
    """

    def __init__(self, md: Markdown) -> None:
        """ Store the placeholder settings and the inline patterns of `md`. """
        self.__placeholder_prefix = util.INLINE_PLACEHOLDER_PREFIX
        self.__placeholder_suffix = util.ETX
        # NOTE(review): the `4` presumably corresponds to the four digit id
        # produced by `"%04d"` in `__makePlaceholder` - confirm. This value is
        # not read anywhere else in this class.
        self.__placeholder_length = 4 + len(self.__placeholder_prefix) \
            + len(self.__placeholder_suffix)
        self.__placeholder_re = util.INLINE_PLACEHOLDER_RE
        self.md = md
        self.inlinePatterns = md.inlinePatterns
        # Lowercased tag names of the ancestors of the text being processed.
        self.ancestors: list[str] = []

    def __makePlaceholder(self, type: str) -> tuple[str, str]:
        """
        Generate a placeholder.

        Arguments:
            type: Not used by this method.

        Returns:
            A tuple of the placeholder string and the id embedded in it. The id
            is the current number of stashed nodes, zero padded to four digits.

        """
        id = "%04d" % len(self.stashed_nodes)
        hash = util.INLINE_PLACEHOLDER % id
        return hash, id

    def __findPlaceholder(self, data: str, index: int) -> tuple[str | None, int]:
        """
        Extract id from data string, start from index.

        Arguments:
            data: String.
            index: Index, from which we start search.

        Returns:
            Placeholder id and string index, after the found placeholder.
            When no placeholder is found, `None` and `index + 1` are returned.

        """
        m = self.__placeholder_re.search(data, index)
        if m:
            return m.group(1), m.end()
        else:
            return None, index + 1

    def __stashNode(self, node: etree.Element | str, type: str) -> str:
        """
        Add node to stash.

        Arguments:
            node: The `Element` or string to store in `self.stashed_nodes`.
            type: Passed on to `__makePlaceholder`.

        Returns:
            The placeholder string under whose id the node was stored.

        """
        placeholder, id = self.__makePlaceholder(type)
        self.stashed_nodes[id] = node
        return placeholder

    def __handleInline(self, data: str, patternIndex: int = 0) -> str:
        """
        Process string with inline patterns and replace it with placeholders.

        Arguments:
            data: A line of Markdown text.
            patternIndex: The index of the `inlinePattern` to start with.

        Returns:
            String with placeholders. An `AtomicString` is returned unchanged.

        """
        if not isinstance(data, util.AtomicString):
            startIndex = 0
            count = len(self.inlinePatterns)
            while patternIndex < count:
                data, matched, startIndex = self.__applyPattern(
                    self.inlinePatterns[patternIndex], data, patternIndex, startIndex
                )
                # The same pattern is tried again for as long as it matches;
                # only a miss moves on to the next pattern.
                if not matched:
                    patternIndex += 1
        return data

    def __processElementText(self, node: etree.Element, subnode: etree.Element, isText: bool = True) -> None:
        """
        Process placeholders in `Element.text` or `Element.tail`
        of Elements popped from `self.stashed_nodes`.

        The text (or tail) of `subnode` is cleared, its placeholders are
        resolved, and the resulting elements are inserted into `node`.

        Arguments:
            node: Parent node.
            subnode: Processing node.
            isText: Boolean variable, True - it's text, False - it's a tail.

        """
        if isText:
            text = subnode.text
            subnode.text = None
        else:
            text = subnode.tail
            subnode.tail = None

        childResult = self.__processPlaceholders(text, subnode, isText)

        # Elements found in a tail go right after `subnode` within `node`;
        # everything else goes to the front of `node`.
        if not isText and node is not subnode:
            pos = list(node).index(subnode) + 1
        else:
            pos = 0

        # Inserting in reverse at a fixed position keeps the original order.
        childResult.reverse()
        for newChild in childResult:
            node.insert(pos, newChild[0])

    def __processPlaceholders(
        self,
        data: str | None,
        parent: etree.Element,
        isText: bool = True
    ) -> list[tuple[etree.Element, list[str]]]:
        """
        Process string with placeholders and generate `ElementTree` tree.

        Plain text between placeholders is attached to the tail of the most
        recently found element or, when no element has been found yet, to the
        text (`isText` is true) or tail (`isText` is false) of `parent`.

        Arguments:
            data: String with placeholders instead of `ElementTree` elements.
            parent: Element, which contains processing inline data.
            isText: Boolean variable, True - it's text, False - it's a tail.

        Returns:
            List with `ElementTree` elements with applied inline patterns.
            Each item is a tuple of the element and a copy of `self.ancestors`
            taken when the element was found.

        """
        def linkText(text: str | None) -> None:
            """ Append `text` to wherever plain text currently belongs. """
            if text:
                if result:
                    # Text following an already found element is its tail.
                    if result[-1][0].tail:
                        result[-1][0].tail += text
                    else:
                        result[-1][0].tail = text
                elif not isText:
                    if parent.tail:
                        parent.tail += text
                    else:
                        parent.tail = text
                else:
                    if parent.text:
                        parent.text += text
                    else:
                        parent.text = text
        result = []
        strartIndex = 0
        while data:
            index = data.find(self.__placeholder_prefix, strartIndex)
            if index != -1:
                id, phEndIndex = self.__findPlaceholder(data, index)

                if id in self.stashed_nodes:
                    node = self.stashed_nodes.get(id)

                    if index > 0:
                        text = data[strartIndex:index]
                        linkText(text)

                    if not isinstance(node, str):  # it's Element
                        # Resolve placeholders nested in the stashed element
                        # itself and in its direct children.
                        for child in [node] + list(node):
                            if child.tail:
                                if child.tail.strip():
                                    self.__processElementText(
                                        node, child, False
                                    )
                            if child.text:
                                if child.text.strip():
                                    self.__processElementText(child, child)
                    else:  # it's just a string
                        linkText(node)
                        strartIndex = phEndIndex
                        continue

                    strartIndex = phEndIndex
                    result.append((node, self.ancestors[:]))

                else:  # wrong placeholder
                    # Keep the prefix as literal text and continue after it.
                    end = index + len(self.__placeholder_prefix)
                    linkText(data[strartIndex:end])
                    strartIndex = end
            else:
                # No more placeholders: the remainder is plain text.
                text = data[strartIndex:]
                if isinstance(data, util.AtomicString):
                    # We don't want to lose the `AtomicString`
                    text = util.AtomicString(text)
                linkText(text)
                # Ends the `while data` loop.
                data = ""

        return result

    def __applyPattern(
        self,
        pattern: inlinepatterns.Pattern,
        data: str,
        patternIndex: int,
        startIndex: int = 0
    ) -> tuple[str, bool, int]:
        """
        Check if the line fits the pattern, create the necessary
        elements, add it to `stashed_nodes`.

        Arguments:
            data: The text to be processed.
            pattern: The pattern to be checked.
            patternIndex: Index of current pattern.
            startIndex: String index, from which we start searching.

        Returns:
            A tuple of the (possibly modified) string with placeholders instead
            of `ElementTree` elements, a boolean telling whether the pattern
            matched, and the index the next search should start from.

        """
        new_style = isinstance(pattern, inlinepatterns.InlineProcessor)

        # Skip the pattern entirely below any ancestor it excludes.
        for exclude in pattern.ANCESTOR_EXCLUDES:
            if exclude.lower() in self.ancestors:
                return data, False, 0

        if new_style:
            match = None
            # Since `handleMatch` may reject our first match,
            # we iterate over the buffer looking for matches
            # until we can't find any more.
            for match in pattern.getCompiledRegExp().finditer(data, startIndex):
                node, start, end = pattern.handleMatch(match, data)
                if start is None or end is None:
                    # NOTE(review): the iterator was already created with the
                    # original `startIndex` and the value is not read again on
                    # this code path, so this increment appears to have no
                    # effect - confirm.
                    startIndex += match.end(0)
                    match = None
                    continue
                break
        else:  # pragma: no cover
            match = pattern.getCompiledRegExp().match(data[startIndex:])
            leftData = data[:startIndex]

        if not match:
            return data, False, 0

        if not new_style:  # pragma: no cover
            node = pattern.handleMatch(match)
            start = match.start(0)
            end = match.end(0)

        if node is None:
            # Matched, but nothing to insert: report a match and continue
            # searching from the end of it.
            return data, True, end

        if not isinstance(node, str):
            if not isinstance(node.text, util.AtomicString):
                # We need to process current node too
                for child in [node] + list(node):
                    # NOTE(review): this tests `node`, not `child`, and `node`
                    # is already known not to be a `str` here, so the condition
                    # is always true.
                    if not isString(node):
                        if child.text:
                            # Text is handled by the patterns following this
                            # one, with the child's tag added as an ancestor.
                            self.ancestors.append(child.tag.lower())
                            child.text = self.__handleInline(
                                child.text, patternIndex + 1
                            )
                            self.ancestors.pop()
                        if child.tail:
                            # Tails are handled starting with this pattern.
                            child.tail = self.__handleInline(
                                child.tail, patternIndex
                            )

        placeholder = self.__stashNode(node, pattern.type())

        if new_style:
            return "{}{}{}".format(data[:start],
                                   placeholder, data[end:]), True, 0
        else:  # pragma: no cover
            return "{}{}{}{}".format(leftData,
                                     match.group(1),
                                     placeholder, match.groups()[-1]), True, 0

    def __build_ancestors(self, parent: etree.Element | None, parents: list[str]) -> None:
        """
        Build the ancestor list.

        Starting at `parent` and following `self.parent_map` upwards, the
        lowercased tag names are collected and appended to `parents` in place,
        outermost element first.

        Arguments:
            parent: The element to start from; its own tag is included.
            parents: The list which is extended with the collected tag names.

        """
        ancestors = []
        while parent is not None:
            # NOTE(review): this check repeats the loop condition.
            if parent is not None:
                ancestors.append(parent.tag.lower())
            parent = self.parent_map.get(parent)
        ancestors.reverse()
        parents.extend(ancestors)

    def run(self, tree: etree.Element, ancestors: list[str] | None = None) -> etree.Element:
        """Apply inline patterns to a parsed Markdown tree.

        Iterate over `Element`, find elements with inline tag, apply inline
        patterns and append newly created Elements to tree. To avoid further
        processing of string with inline patterns, instead of normal string,
        use subclass [`AtomicString`][markdown.util.AtomicString]:

            node.text = markdown.util.AtomicString("This will not be processed.")

        Arguments:
            tree: `Element` object, representing Markdown tree.
            ancestors: List of parent tag names that precede the tree node (if needed).

        Returns:
            An element tree object with applied inline patterns.

        """
        # The stash is reset on every run.
        self.stashed_nodes: dict[str, etree.Element | str] = {}

        # Ensure a valid parent list, but copy passed in lists
        # to ensure we don't have the user accidentally change it on us.
        tree_parents = [] if ancestors is None else ancestors[:]

        # Maps every element of the tree to its parent element.
        self.parent_map = {c: p for p in tree.iter() for c in p}
        stack = [(tree, tree_parents)]

        while stack:
            currElement, parents = stack.pop()

            self.ancestors = parents
            self.__build_ancestors(currElement, self.ancestors)

            # Pairs of a child and the elements generated from its text;
            # they are inserted after the loop over the children.
            insertQueue = []
            for child in currElement:
                if child.text and not isinstance(
                    child.text, util.AtomicString
                ):
                    self.ancestors.append(child.tag.lower())
                    text = child.text
                    child.text = None
                    lst = self.__processPlaceholders(
                        self.__handleInline(text), child
                    )
                    for item in lst:
                        self.parent_map[item[0]] = child
                    # The generated elements are processed in a later pass.
                    stack += lst
                    insertQueue.append((child, lst))
                    self.ancestors.pop()
                if child.tail:
                    tail = self.__handleInline(child.tail)
                    # Throwaway element: leading plain text of the tail ends
                    # up in its `tail` and is copied back to `child` below.
                    dumby = etree.Element('d')
                    child.tail = None
                    tailResult = self.__processPlaceholders(tail, dumby, False)
                    if dumby.tail:
                        child.tail = dumby.tail
                    pos = list(currElement).index(child) + 1
                    # Inserting in reverse at a fixed position keeps the order.
                    tailResult.reverse()
                    for newChild in tailResult:
                        self.parent_map[newChild[0]] = currElement
                        currElement.insert(pos, newChild[0])
                if len(child):
                    self.parent_map[child] = currElement
                    stack.append((child, self.ancestors[:]))

            for element, lst in insertQueue:
                for i, obj in enumerate(lst):
                    newChild = obj[0]
                    element.insert(i, newChild)
        return tree
|
||
|
|
||
|
|
||
|
class PrettifyTreeprocessor(Treeprocessor):
    """ Insert newlines into the tree so that the resulting HTML has line breaks. """

    def _prettifyETree(self, elem: etree.Element) -> None:
        """ Add line breaks to `elem` and, recursively, to its block level children.

        Arguments:
            elem: The `Element` whose `text` and `tail` may be set to a newline.

        """
        newline = "\n"
        is_block = self.md.is_block_level

        if is_block(elem.tag) and elem.tag not in ('code', 'pre'):
            has_blank_text = not (elem.text and elem.text.strip())
            # Only break after the opening tag when a block element follows.
            if has_blank_text and len(elem) and is_block(elem[0].tag):
                elem.text = newline
            for sub in elem:
                if is_block(sub.tag):
                    self._prettifyETree(sub)

        if not (elem.tail and elem.tail.strip()):
            elem.tail = newline

    def run(self, root: etree.Element) -> None:
        """ Add line breaks to `root`, its children, `br` tails and code blocks.

        Arguments:
            root: The root `Element` of the tree, modified in place.

        """
        self._prettifyETree(root)

        # `<br />` elements usually sit inside inline content, which
        # `_prettifyETree` does not descend into, so handle them here.
        for line_break in root.iter('br'):
            tail = line_break.tail
            if tail and tail.strip():
                line_break.tail = '\n%s' % tail
            else:
                line_break.tail = '\n'

        # Collapse trailing whitespace of code blocks into one newline.
        for pre in root.iter('pre'):
            if not len(pre) or pre[0].tag != 'code':
                continue
            code = pre[0]
            # Leave code elements alone if they have children or no text.
            if len(code) == 0 and code.text is not None:
                code.text = util.AtomicString(code.text.rstrip() + '\n')
|
||
|
|
||
|
|
||
|
class UnescapeTreeprocessor(Treeprocessor):
    """ Turn escaped character placeholders back into the characters themselves. """

    # A decimal number enclosed in `util.STX` and `util.ETX`.
    RE = re.compile(r'{}(\d+){}'.format(util.STX, util.ETX))

    def _unescape(self, m: re.Match[str]) -> str:
        """ Return the character whose code point is the number captured by `m`. """
        code_point = int(m.group(1))
        return chr(code_point)

    def unescape(self, text: str) -> str:
        """ Return `text` with every match of `RE` replaced by its character. """
        return self.RE.sub(self._unescape, text)

    def run(self, root: etree.Element) -> None:
        """ Unescape the text, tail and attribute values of every element.

        The text of `code` elements is left as it is; their tails and
        attribute values are still unescaped.

        Arguments:
            root: The root `Element` of the tree, modified in place.

        """
        restore = self.unescape
        for node in root.iter():
            content, trailing = node.text, node.tail
            if content and node.tag != 'code':
                node.text = restore(content)
            if trailing:
                node.tail = restore(trailing)
            for attr_name, attr_value in node.items():
                node.set(attr_name, restore(attr_value))
|