# Source code for markup.utils

__copyright__ = "Copyright © Stichting SciPost (SciPost Foundation)"
__license__ = "AGPL v3"


import bleach
from docutils.core import publish_parts
import markdown
from io import StringIO
import re

from django.template.defaultfilters import linebreaksbr
from django.utils.encoding import force_text
from django.utils.safestring import mark_safe

from .constants import ReST_HEADER_REGEX_DICT, ReST_ROLES, ReST_DIRECTIVES, BLEACH_ALLOWED_TAGS


# Inline or displayed math
def match_inline_math(text):
    r"""
    Return the first match object for inline math, ``$...$`` or ``\(...\)``.

    The dollar-delimited form is searched first; the ``\(...\)`` form is only
    tried when no ``$...$`` span is found anywhere in the text.

    :param text: string to search.
    :returns: a regex match object, or ``None`` if no inline math is present.
    """
    # Match objects are always truthy, so ``or`` falls through only on None.
    return re.search(r'\$[^$]+\$', text) or re.search(r'\\\(.+\\\)', text)
def match_displayed_math(text):
    r"""
    Return the first match object for displayed math, ``$$...$$`` or ``\[...\]``.

    Both searches use ``re.DOTALL`` so a displayed equation may span several
    lines. The ``$$...$$`` form is searched first; ``\[...\]`` is only tried
    when the former is absent.

    :param text: string to search.
    :returns: a regex match object, or ``None`` if no displayed math is present.
    """
    # Match objects are always truthy, so ``or`` falls through only on None.
    return (re.search(r'\$\$.+\$\$', text, re.DOTALL)
            or re.search(r'\\\[.+\\\]', text, re.DOTALL))
# Headers
def match_md_header(text, level=None):
    """
    Return the first match object for a Markdown header of at least ``level``.

    A header is a line starting with ``level`` or more ``#`` characters,
    followed by a single space and some non-empty text.

    :param text: string to search.
    :param level: header level from 1 to 6. When omitted (or falsy), levels
        1 to 6 are tried in turn and the first match is returned.
    :returns: a regex match object, or ``None`` if no header is found.
    :raises TypeError: if a truthy ``level`` is not an int.
    :raises ValueError: if ``level`` is an int outside 1 to 6.
    """
    if level:
        if not isinstance(level, int):
            raise TypeError('level must be an int')
        if not 1 <= level <= 6:
            raise ValueError('level must be an integer from 1 to 6')
        header_pattern = '^#{%s,}[ ].+$' % (level,)
        return re.search(header_pattern, text, re.MULTILINE)

    # No level requested: probe each level in increasing order.
    for candidate in range(1, 7):
        found = match_md_header(text, candidate)
        if found:
            return found
    return None
def match_rst_header(text, symbol=None):
    """
    Return the first match object for a reStructuredText header.

    The regex for each symbol is looked up in ``ReST_HEADER_REGEX_DICT``
    (imported from ``.constants``). The intent, following Python conventions,
    is that ``#`` and ``*`` headers carry both an overline and an underline of
    equal length, while ``=``, ``-``, ``"`` and ``^`` headers only carry an
    underline.

    :param text: string to search.
    :param symbol: header symbol to look for. When omitted (or falsy), the
        symbols ``#``, ``*``, ``=``, ``-``, ``"`` and ``^`` are tried in that
        order and the first match is returned.
    :returns: a regex match object, or ``None`` if no header is found.
    :raises ValueError: if ``symbol`` is not a key of ``ReST_HEADER_REGEX_DICT``.
    """
    if symbol:
        if symbol not in ReST_HEADER_REGEX_DICT:
            raise ValueError('symbol is not a ReST header symbol')
        return re.search(ReST_HEADER_REGEX_DICT[symbol], text, re.MULTILINE)

    # Explicit checking order.
    for candidate in ('#', '*', '=', '-', '"', '^'):
        found = match_rst_header(text, candidate)
        if found:
            return found
    return None
# Blockquotes
def match_md_blockquote(text):
    """
    Return the first match object for a Markdown blockquote.

    A blockquote line is optional leading spaces, ``>``, one space and some
    non-empty text. The search runs with ``re.DOTALL`` and ``re.MULTILINE``,
    so the matched span may extend over the lines that follow.

    :param text: string to search.
    :returns: a regex match object, or ``None`` if no blockquote is found.
    """
    blockquote_re = re.compile(
        r'(^[ ]*>[ ].+){1,5}', re.DOTALL | re.MULTILINE)
    return blockquote_re.search(text)
# Hyperlinks
# reStructuredText roles and directives
def match_rst_role(text, role=None):
    """
    Return the first match object for a reStructuredText role, i.e. text of
    the form ``:role:`` immediately followed by a backquoted span.

    :param text: string to search.
    :param role: name of the role to look for. When omitted (or falsy), every
        role in ``ReST_ROLES`` is tried in turn and the first match returned.
    :returns: a regex match object, or ``None`` if no role is found.
    :raises ValueError: if ``role`` is not listed in ``ReST_ROLES``.
    """
    if role:
        if role not in ReST_ROLES:
            raise ValueError('this role is not listed in ReST roles')
        role_pattern = ':' + role + ':`.+`'
        return re.search(role_pattern, text)

    # No role requested: probe each known role in listed order.
    for candidate in ReST_ROLES:
        found = match_rst_role(text, candidate)
        if found:
            return found
    return None
def match_rst_directive(text, directive=None):
    """
    Return the first match object for a reStructuredText directive.

    A directive is a line starting with ``.. name::``. The remainder of that
    line and the one to three lines following it are included in the match;
    at least one newline must follow the directive line.

    :param text: string to search.
    :param directive: name of the directive to look for. When omitted (or
        falsy), every directive in ``ReST_DIRECTIVES`` is tried in turn and
        the first match is returned.
    :returns: a regex match object, or ``None`` if no directive is found.
    :raises ValueError: if ``directive`` is not listed in ``ReST_DIRECTIVES``.
    """
    if not directive:
        for candidate in ReST_DIRECTIVES:
            found = match_rst_directive(text, candidate)
            if found:
                return found
        return None
    if directive not in ReST_DIRECTIVES:
        raise ValueError('this directive is not listed in ReST directives')
    # ``(.*)`` replaces the former nested quantifier ``(.+)*``: both consume
    # the rest of a line, but the nested form backtracks exponentially when a
    # long directive line is not followed by a newline. The overall matched
    # span is unchanged; an empty line now captures '' rather than None.
    return re.search(
        r'^\.\. ' + directive + r'::(.*)(\n(.*)){1,3}', text, re.MULTILINE)
# Lists
def match_md_unordered_list(text):
    """
    Return the first match object for a Markdown-only unordered list.

    Only the ``+`` and ``-`` bullets are considered here; the ``*`` bullet is
    shared with reStructuredText and is therefore excluded. Up to three
    consecutive items are included in the match.

    :param text: string to search.
    :returns: a regex match object, or ``None`` if no such list is found.
    """
    bullet_items = r'(^[\s]*[+-][ ].+$[\n]*){1,3}'
    return re.search(bullet_items, text, flags=re.MULTILINE)
def match_md_or_rst_unordered_list(text):
    """
    Return the first match object for an unordered list using the ``*``
    bullet, which Markdown and reStructuredText have in common.

    Up to three consecutive items are included in the match.

    :param text: string to search.
    :returns: a regex match object, or ``None`` if no such list is found.
    """
    star_items = r'(^[\s]*[\*][ ].+$[\n]*){1,3}'
    return re.search(star_items, text, flags=re.MULTILINE)
def match_md_or_rst_ordered_list(text):
    r"""
    Return the first match object for a numbered list shared by Markdown and
    reStructuredText, i.e. items of the form ``1. text``.

    An item is optional leading whitespace, one or more digits, a literal
    period, one space and some non-empty text. Up to three consecutive items
    are included in the match.

    :param text: string to search.
    :returns: a regex match object, or ``None`` if no such list is found.
    """
    # The period must be escaped: a bare ``.`` matches any character, which
    # made ordinary lines starting with a number (e.g. "2019 was ...") look
    # like list items.
    return re.search(r'(^[\s]*[0-9]+\.[ ].+$[\n]*){1,3}', text, re.MULTILINE)
def match_rst_ordered_list(text):
    """
    Return the first match object for a reStructuredText auto-numbered list,
    i.e. items of the form ``#. text``.

    Up to three consecutive items are included in the match.

    :param text: string to search.
    :returns: a regex match object, or ``None`` if no such list is found.
    """
    auto_numbered_items = r'(^[\s]*[#]\.[ ].+$[\n]*){1,3}'
    return re.search(auto_numbered_items, text, flags=re.MULTILINE)
def check_markers(markers):
    """
    Check the consistency of a markers dictionary and return a detector.

    :param markers: dict mapping a language group (``'plain_or_md'``,
        ``'md'``, ``'md_or_rst'``, ``'rst'``) to a dict of named marker
        results. Only truthy results count as detected markers.
    :returns: dict with keys ``language`` (``'plain'``, ``'Markdown'`` or
        ``'reStructuredText'``) and ``errors`` (``None``, or a message naming
        one conflicting marker from each side when reStructuredText markers
        are mixed with Markdown or plain/Markdown ones).
    """
    # Keep only the markers that actually matched, group by group.
    detected = {
        group: {name: hit for name, hit in results.items() if hit}
        for group, results in markers.items()
    }

    if detected['rst']:
        if detected['md']:
            return {
                'language': 'plain',
                'errors': ('Inconsistency: Markdown and reStructuredText syntaxes are mixed:\n\n'
                           'Markdown: %s\n\nreStructuredText: %s' % (
                               detected['md'].popitem(),
                               detected['rst'].popitem())),
            }
        if detected['plain_or_md']:
            return {
                'language': 'plain',
                'errors': ('Inconsistency: plain/Markdown and reStructuredText '
                           'syntaxes are mixed:\n\n'
                           'Markdown: %s\n\nreStructuredText: %s' % (
                               detected['plain_or_md'].popitem(),
                               detected['rst'].popitem())),
            }
        return {'language': 'reStructuredText', 'errors': None}

    # Markup shared by Markdown and reStructuredText is indeterminate;
    # it is assumed to be Markdown.
    if detected['md'] or detected['md_or_rst']:
        return {'language': 'Markdown', 'errors': None}

    return {'language': 'plain', 'errors': None}
def detect_markup_language(text):
    r"""
    Detect whether text is plain text, Markdown or reStructuredText.

    This method returns the dictionary built by ``check_markers``, containing:

    * ``language``: ``'plain'``, ``'Markdown'`` or ``'reStructuredText'``
    * ``errors``: ``None``, or a message when incompatible syntaxes are mixed

    Empty (falsy) text is reported as plain without running any check.

    Inline and displayed maths are assumed enabled through MathJax.
    For plain text and Markdown, this assumes the conventions

    * inline: ``$ ... $`` and ``\( ... \)``
    * displayed: ``$$ ... $$`` and ``\[ ... \]``

    while for reStructuredText, the ``math`` role and directive are used.

    We define markers and indicators. A marker is a regex which occurs
    in only one of the languages. An indicator occurs in more than one,
    but not all languages.

    Language markers:

    Markdown:

    * headers: [one or more ``#``] [non-empty text]
    * blockquotes: one or more lines starting with ``>`` [non-empty text]
    * unordered lists using ``+`` or ``-``
    * inline and reference hyperlinks

    reStructuredText:

    * use of the ``:math:`` role or ``.. math::`` directive
    * ``#`` or ``*`` headlines with over- and underline
    * other header markers (``=``, ``-``, ``"`` and ``^``)
    * ordered lists using ``#.``
    * inline and reference hyperlinks
    * use of any other role
    * use of any other directive

    Language indicators:

    Plain text or Markdown:

    * inline or displayed maths

    Markdown or reStructuredText:

    * unordered lists using ``*``
    * ordered lists using numbers

    Exclusions (sources of errors):

    * inline or displayed maths cannot be used in ReST

    Any simultaneously present markers to two different languages
    return an error.

    Checking order:

    * maths
    * headers/blockquotes
    * lists
    * hyperlinks
    * rst roles
    * rst directives

    :param text: the user-supplied text to analyse.
    :returns: the detector dictionary described above.
    """
    if not text:
        return {
            'language': 'plain',
            'errors': None,
        }

    # One dict per language group. Insertion order matters: the consistency
    # check reports the most recently inserted truthy marker of a group.
    plain_or_md, md, md_or_rst, rst = {}, {}, {}, {}

    # Maths
    # Inline maths is of the form $ ... $ or \( ... \)
    plain_or_md['inline_math'] = match_inline_math(text)
    # Displayed maths is of the form \[ ... \] or $$ ... $$
    plain_or_md['displayed_math'] = match_displayed_math(text)
    # For rst, check math role and directive
    rst['math_role'] = match_rst_role(text, 'math')
    rst['math_directive'] = match_rst_directive(text, 'math')

    # Headers and blockquotes
    md['header'] = match_md_header(text)
    md['blockquote'] = match_md_blockquote(text)
    rst['header'] = match_rst_header(text)

    # Lists
    md['unordered_list'] = match_md_unordered_list(text)
    md_or_rst['unordered_list'] = match_md_or_rst_unordered_list(text)
    md_or_rst['ordered_list'] = match_md_or_rst_ordered_list(text)
    rst['ordered_list'] = match_rst_ordered_list(text)

    # Hyperrefs
    # NOTE(review): these four hyperlink matchers are not defined in the part
    # of the module visible here - confirm they are in scope at runtime.
    md['href_inline'] = match_md_hyperlink_inline(text)
    md['href_reference'] = match_md_hyperlink_reference(text)
    rst['href_inline'] = match_rst_hyperlink_inline(text)
    rst['href_reference'] = match_rst_hyperlink_reference(text)

    # ReST roles and directives
    rst['role'] = match_rst_role(text)
    rst['directive'] = match_rst_directive(text)

    return check_markers({
        'plain_or_md': plain_or_md,
        'md': md,
        'md_or_rst': md_or_rst,
        'rst': rst,
    })
def apply_markdown_preserving_displayed_maths_bracket(text):
    r"""
    Apply Markdown to ``text`` while leaving ``\[ ... \]`` spans untouched.

    Subsidiary function called by ``apply_markdown_preserving_displayed_maths``.
    See explanations in docstring of that method.

    The text is consumed left to right: the part before each ``\[`` goes
    through Markdown, then everything up to and including the matching ``\]``
    is copied verbatim, and the process repeats on what remains.

    :param text: string to process.
    :returns: the processed string.
    """
    # Iterative on purpose: one recursion level per maths block would let a
    # text with very many blocks exceed the interpreter's recursion limit.
    chunks = []
    remainder = text
    while True:
        before, opener, after = remainder.partition(r'\[')
        maths, closer, remainder = after.partition(r'\]')
        chunks.append(markdown.markdown(before, output_format='html5'))
        chunks.extend((opener, maths, closer))
        if not remainder:
            break
    return ''.join(chunks)
def apply_markdown_preserving_displayed_maths(text):
    r"""
    Process ``text`` by first splitting out displayed maths, then applying
    Markdown on the non-displayed-math parts.

    Both ``$$ ... $$`` and ``\[ ... \]`` are recognized. This function deals
    with the ``$$ ... $$`` spans, copying them verbatim, and hands every
    stretch of text outside them to
    ``apply_markdown_preserving_displayed_maths_bracket``, which deals with
    the ``\[ ... \]`` spans.

    :param text: string to process.
    :returns: the processed string.
    """
    # Iterative on purpose: one recursion level per maths block would let a
    # text with very many blocks exceed the interpreter's recursion limit.
    chunks = []
    remainder = text
    while True:
        before, opener, after = remainder.partition('$$')
        maths, closer, remainder = after.partition('$$')
        chunks.append(apply_markdown_preserving_displayed_maths_bracket(before))
        chunks.extend((opener, maths, closer))
        if not remainder:
            break
    return ''.join(chunks)
def process_markup(text, language_forced=None):
    """
    Detect the markup language of ``text`` and render it to HTML.

    :param text: the user-supplied text to process.
    :param language_forced: optional language name overriding the detected
        one. ``'reStructuredText'`` and ``'Markdown'`` select the matching
        renderer; any other value is rendered as plain text.
    :returns: dict with keys

        * ``language``: the language actually used
        * ``errors``: detection or rendering errors, ``None`` if there are none
        * ``warnings``: set when the forced language differs from the
          detected one, else ``None``
        * ``processed``: the rendered output; left as ``''`` when errors
          are reported
    """
    detected = detect_markup_language(text)
    language = language_forced if language_forced else detected['language']
    markup = {
        'language': language,
        'errors': detected['errors'],
        'warnings': None,
        'processed': '',
    }

    if language_forced and language_forced != detected['language']:
        markup['warnings'] = (
            'Warning: markup language was forced to %s, while the detected one was %s.'
        ) % (language_forced, detected['language'])

    # Detection errors (mixed syntaxes) abort processing, even when forced.
    if markup['errors']:
        return markup

    if language == 'reStructuredText':
        warn_stream = StringIO()
        try:
            parts = publish_parts(
                source=text,
                writer_name='html5_polyglot',
                settings_overrides={
                    'math_output': 'MathJax https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js?config=TeX-MML-AM_CHTML,Safe',
                    'initial_header_level': 1,
                    'doctitle_xform': False,
                    'raw_enabled': False,
                    'file_insertion_enabled': False,
                    'warning_stream': warn_stream,
                })
            # NOTE(review): unlike the Markdown branch, this output is marked
            # safe without going through bleach - confirm this is intended.
            markup['processed'] = mark_safe(force_text(parts['html_body']))
        except (Exception, SystemExit) as error:
            # NOTE(review): docutils is understood to signal reported failures
            # through sys.exit(), hence SystemExit is caught here - confirm
            # against the installed version. KeyboardInterrupt and
            # GeneratorExit are deliberately left to propagate.
            # Fall back on the exception itself so that a failure never
            # yields an empty (falsy) error.
            markup['errors'] = warn_stream.getvalue() or repr(error)
    elif language == 'Markdown':
        markup['processed'] = mark_safe(
            bleach.clean(
                apply_markdown_preserving_displayed_maths(text),
                tags=BLEACH_ALLOWED_TAGS,
            )
        )
    else:
        markup['processed'] = linebreaksbr(text)

    return markup