__copyright__ = "Copyright © Stichting SciPost (SciPost Foundation)"
__license__ = "AGPL v3"
import bleach
from docutils.core import publish_parts
import markdown
from io import StringIO
import re
from django.template.defaultfilters import linebreaksbr
from django.utils.encoding import force_text
from django.utils.safestring import mark_safe
from .constants import ReST_HEADER_REGEX_DICT, ReST_ROLES, ReST_DIRECTIVES, BLEACH_ALLOWED_TAGS
# Inline or displayed math
def match_inline_math(text):
    r"""
    Return the first match object for inline math in ``text``, or ``None``.

    Two delimiter styles are tried in order: ``$...$`` first, then ``\(...\)``.
    """
    # A match object is always truthy, so ``or`` only falls through to the
    # second search when the first one found nothing.
    return re.search(r'\$[^$]+\$', text) or re.search(r'\\\(.+\\\)', text)
def match_displayed_math(text):
    r"""
    Return the first match object for displayed math in ``text``, or ``None``.

    Two delimiter styles are tried in order: ``$$...$$`` first, then ``\[...\]``.
    Both searches use ``re.DOTALL`` so the maths may span several lines.
    """
    # A match object is always truthy, so ``or`` only falls through to the
    # second search when the first one found nothing.
    return (
        re.search(r'\$\$.+\$\$', text, re.DOTALL)
        or re.search(r'\\\[.+\\\]', text, re.DOTALL)
    )
# Headers
# Blockquotes
def match_md_blockquote(text):
    """
    Return the first match object for a Markdown blockquote, or ``None``.

    A blockquote line is optional leading spaces, ``>``, one space, then text.
    Because ``re.DOTALL`` is set, the trailing ``.+`` also consumes newlines,
    so the match extends from the first quoted line to the end of ``text``.
    """
    flags = re.DOTALL | re.MULTILINE
    return re.search(r'(^[ ]*>[ ].+){1,5}', text, flags)
# Hyperlinks
def match_md_hyperlink_inline(text):
    """
    Return the first match object for a Markdown inline hyperlink, or ``None``.

    The form looked for is ``[label](target)`` where the target starts with
    ``http`` or ``mailto``.
    """
    return re.search(r'\[.+\]\(((http)|(mailto)).+\)', text)
def match_md_hyperlink_reference(text):
    """
    Return the first match object for a Markdown reference-style hyperlink, or ``None``.

    The form looked for is the reference definition ``[label]: target`` where
    the target starts with ``http`` or ``mailto``.
    """
    return re.search(r'\[.+\]: ((http)|(mailto)).+', text)
def match_rst_hyperlink_inline(text):
    """
    Return the first match object for a reStructuredText inline hyperlink, or ``None``.

    The form looked for is a backquoted ``label <target>`` followed by ``_``,
    where the target starts with ``http`` or ``mailto``.
    """
    return re.search(r'`.+<((http)|(mailto)).+>`_', text)
def match_rst_hyperlink_reference(text):
    """
    Return the first match object for a reStructuredText reference-style
    hyperlink, or ``None``.

    The form looked for is a backquoted label followed by ``_``.
    """
    # The character after the opening backquote must not be ``_`` (that would
    # be the tail of a previous hyperlink), and the label must not contain
    # ``<`` (it is then assumed to be an inline hyperlink with <http...).
    return re.search(r'`[^_][^<]+`_', text)
# reStructuredText roles and directives
def match_rst_role(text, role=None):
    """
    Return the first match object for a ReST role, i.e. ``:role:`` followed by
    backquoted content, or ``None`` if there is no match.

    If no role is given, all roles in ReST_ROLES are tested one by one and the
    first match found is returned.

    Raises ``ValueError`` if ``role`` is given but is not listed in ReST_ROLES.
    """
    if not role:
        for candidate in ReST_ROLES:
            found = match_rst_role(text, candidate)
            if found:
                return found
        return None
    if role not in ReST_ROLES:
        raise ValueError('this role is not listed in ReST roles')
    # Escape the role name so it is always matched literally, whatever
    # characters the configured names contain.
    return re.search(':' + re.escape(role) + ':`.+`', text)
def match_rst_directive(text, directive=None):
    """
    Return the first match object for a ReST directive (``.. name::`` at the
    start of a line), or ``None`` if there is no match.

    If no directive is given, all directives in ReST_DIRECTIVES are tested one
    by one and the first match found is returned.
    The first one to three lines after the directive statement are also captured.

    Raises ``ValueError`` if ``directive`` is given but is not listed in
    ReST_DIRECTIVES.
    """
    if not directive:
        for candidate in ReST_DIRECTIVES:
            found = match_rst_directive(text, candidate)
            if found:
                return found
        return None
    if directive not in ReST_DIRECTIVES:
        raise ValueError('this directive is not listed in ReST directives')
    # ``(.*)`` replaces the former ``(.+)*``: it matches the same text, but the
    # nested quantifier backtracked exponentially when the directive was on the
    # last line of the text with no newline after it.
    # The directive name is escaped so it is always matched literally.
    pattern = r'^\.\. ' + re.escape(directive) + r'::(.*)(\n(.*)){1,3}'
    return re.search(pattern, text, re.MULTILINE)
# Lists
def match_md_unordered_list(text):
    """
    Return the first match object for a Markdown-only unordered list, or ``None``.

    Only the ``+`` and ``-`` markers are considered here; the ``*`` marker is
    shared with ReST and is handled by ``match_md_or_rst_unordered_list``.
    Up to three consecutive items are included in the match.
    """
    return re.search(r'(^[\s]*[+-][ ].+$[\n]*){1,3}', text, re.MULTILINE)
def match_md_or_rst_unordered_list(text):
    """
    Return the first match object for an unordered list using the ``*`` marker,
    or ``None``.

    This marker is shared by Markdown and ReST, so a match does not identify
    the language on its own. Up to three consecutive items are included.
    """
    return re.search(r'(^[\s]*[\*][ ].+$[\n]*){1,3}', text, re.MULTILINE)
def match_md_or_rst_ordered_list(text):
    """
    Return the first match object for a numbered ordered list, or ``None``.

    An item is a line starting with digits, a literal dot, one space and some
    text (``1. item``). This form is shared by Markdown and ReST, so a match
    does not identify the language on its own. Up to three consecutive items
    are included in the match.
    """
    # The dot after the number is escaped: unescaped it matched any character,
    # so a line merely starting with a multi-digit number ("2019 was ...") was
    # detected as a list.
    return re.search(r'(^[\s]*[0-9]+\.[ ].+$[\n]*){1,3}', text, re.MULTILINE)
def match_rst_ordered_list(text):
    """
    Return the first match object for a ReST auto-numbered ordered list, or ``None``.

    An item is a line starting with ``#.``, one space and some text.
    Up to three consecutive items are included in the match.
    """
    return re.search(r'(^[\s]*[#]\.[ ].+$[\n]*){1,3}', text, re.MULTILINE)
def check_markers(markers):
    """
    Check the consistency of a markers dictionary and return a detector.

    ``markers`` maps a category (``'plain_or_md'``, ``'md'``, ``'md_or_rst'``,
    ``'rst'``) to a dictionary of marker name -> match result. Only truthy
    match results count; a missing category is treated as empty.

    The returned detector is a dictionary with keys ``'language'`` (one of
    ``'plain'``, ``'Markdown'``, ``'reStructuredText'``) and ``'errors'``
    (``None``, or a message when ReST markers are mixed with Markdown or
    plain/Markdown ones; the language is then ``'plain'``).
    """
    # Keep only the markers which actually matched.
    found = {
        category: {name: match for name, match in entries.items() if match}
        for category, entries in markers.items()
    }
    rst = found.get('rst', {})
    md = found.get('md', {})
    plain_or_md = found.get('plain_or_md', {})
    md_or_rst = found.get('md_or_rst', {})

    if rst:
        # ReST markers may not coexist with Markdown-only or plain/Markdown
        # ones; report one offending marker from each side.
        if md:
            return {
                'language': 'plain',
                'errors': ('Inconsistency: Markdown and reStructuredText syntaxes are mixed:\n\n'
                           'Markdown: %s\n\nreStructuredText: %s' % (
                               md.popitem(),
                               rst.popitem()))
            }
        if plain_or_md:
            return {
                'language': 'plain',
                'errors': ('Inconsistency: plain/Markdown and reStructuredText '
                           'syntaxes are mixed:\n\n'
                           'Markdown: %s\n\nreStructuredText: %s' % (
                               plain_or_md.popitem(),
                               rst.popitem()))
            }
        return {
            'language': 'reStructuredText',
            'errors': None,
        }
    # Markup shared by Markdown and ReST is indeterminate; assume Markdown.
    if md or md_or_rst:
        return {
            'language': 'Markdown',
            'errors': None,
        }
    return {
        'language': 'plain',
        'errors': None,
    }
def detect_markup_language(text):
    r"""
    Detect whether text is plain text, Markdown or reStructuredText.

    This method returns a dictionary containing:

    * language
    * errors

    Empty text is reported as plain, without errors.

    Inline and displayed maths are assumed enabled through MathJax.
    For plain text and Markdown, this assumes the conventions

    * inline: $ ... $ and \( ... \)
    * displayed: $$ ... $$ and \[ ... \]

    while for reStructuredText, the ``math`` role and directive are used.

    We define markers and indicators. A marker is a regex which occurs
    in only one of the languages. An indicator occurs in more than one,
    but not all languages.

    Language markers:

    Markdown:

    * headers: [one or more #] [non-empty text]
    * blockquotes: one or more lines starting with > [non-empty text]

    reStructuredText:

    * use of the :math: role or .. math:: directive
    * [two or more #][blank space][carriage return]
      [text on a single line, as long as or shorter than # sequence]
      [same length of #]
    * same thing but for * headlines
    * other header markers (=, -, " and ^)
    * use of any other role
    * use of any other directive

    Language indicators:

    Plain text or Markdown:

    * inline or displayed maths

    Markdown or reStructuredText:

    * [=]+ alone on a line  <- users discouraged to use this in Markdown
    * [-]+ alone on a line  <- users discouraged to use this in Markdown

    Exclusions (sources of errors):

    * inline or displayed maths cannot be used in ReST

    Any simultaneously present markers to two different languages
    return an error.

    Checking order:

    * maths
    * headers/blockquotes
    * lists
    * hyperlinks
    * rst roles
    * rst directives
    """
    if not text:
        return {
            'language': 'plain',
            'errors': None,
        }

    # Key order within each category is significant: when an inconsistency is
    # reported, check_markers shows the last matching entry of each category.
    markers = {
        'plain_or_md': {
            # Inline maths is of the form $ ... $ or \( ... \)
            'inline_math': match_inline_math(text),
            # Displayed maths is of the form \[ ... \] or $$ ... $$
            'displayed_math': match_displayed_math(text),
        },
        'md': {
            'header': match_md_header(text),
            'blockquote': match_md_blockquote(text),
            'unordered_list': match_md_unordered_list(text),
            'href_inline': match_md_hyperlink_inline(text),
            'href_reference': match_md_hyperlink_reference(text),
        },
        'md_or_rst': {
            'unordered_list': match_md_or_rst_unordered_list(text),
            'ordered_list': match_md_or_rst_ordered_list(text),
        },
        'rst': {
            # For rst, maths shows up through the math role and directive
            'math_role': match_rst_role(text, 'math'),
            'math_directive': match_rst_directive(text, 'math'),
            'header': match_rst_header(text),
            'ordered_list': match_rst_ordered_list(text),
            'href_inline': match_rst_hyperlink_inline(text),
            'href_reference': match_rst_hyperlink_reference(text),
            # Any role / any directive
            'role': match_rst_role(text),
            'directive': match_rst_directive(text),
        },
    }
    return check_markers(markers)
def apply_markdown_preserving_displayed_maths_bracket(text):
    r"""
    Subsidiary function called by ``apply_markdown_preserving_displayed_maths``.

    Splits ``text`` at each ``\[ ... \]`` displayed-maths block, applies
    Markdown (html5 output) to the parts outside the maths, and leaves the
    maths and its delimiters untouched. If an opening ``\[`` has no closing
    ``\]``, everything after it is left untouched.

    Returns the reassembled string.
    """
    pieces = []
    remaining = text
    # Iterate instead of recursing once per maths block, so that text with
    # very many blocks cannot hit the interpreter's recursion limit.
    while True:
        before, opener, rest = remaining.partition(r'\[')
        maths, closer, remaining = rest.partition(r'\]')
        pieces.append(markdown.markdown(before, output_format='html5'))
        pieces.extend((opener, maths, closer))
        if not remaining:
            break
    return ''.join(pieces)
def apply_markdown_preserving_displayed_maths(text):
    r"""
    Processes the string text by first splitting out displayed maths, then applying
    Markdown on the non-displayed math parts.

    Both ``$$ ... $$`` and ``\[ ... \]`` are recognized, so a two-level logic is used:
    this function splits on ``$$ ... $$`` and hands each part outside those
    blocks to ``apply_markdown_preserving_displayed_maths_bracket``, which deals
    with ``\[ ... \]``. If an opening ``$$`` has no closing ``$$``, everything
    after it is left untouched.

    Returns the reassembled string.
    """
    pieces = []
    remaining = text
    # Iterate instead of recursing once per maths block, so that text with
    # very many blocks cannot hit the interpreter's recursion limit.
    while True:
        before, opener, rest = remaining.partition('$$')
        maths, closer, remaining = rest.partition('$$')
        pieces.append(apply_markdown_preserving_displayed_maths_bracket(before))
        pieces.extend((opener, maths, closer))
        if not remaining:
            break
    return ''.join(pieces)
def process_markup(text, language_forced=None):
    """
    Detect the markup language of ``text`` and render it accordingly.

    If ``language_forced`` is given it overrides the detected language; a
    warning is recorded when the two differ.

    Returns a dictionary with keys:

    * ``language``: the language used for processing
    * ``errors``: detection or rendering errors, or ``None``
    * ``warnings``: the forced-language warning, or ``None``
    * ``processed``: the rendered output (empty string if errors occurred)

    Detection errors short-circuit processing, also when a language is forced.
    """
    markup_detector = detect_markup_language(text)
    detected = markup_detector['language']

    markup = {
        'language': 'plain',
        'errors': None,
        'warnings': None,
        'processed': ''
    }

    if language_forced and language_forced != detected:
        markup['warnings'] = (
            'Warning: markup language was forced to %s, while the detected one was %s.'
        ) % (language_forced, detected)

    language = language_forced if language_forced else detected
    markup['language'] = language
    markup['errors'] = markup_detector['errors']
    if markup['errors']:
        return markup

    if language == 'reStructuredText':
        warning_stream = StringIO()
        try:
            parts = publish_parts(
                source=text,
                writer_name='html5_polyglot',
                settings_overrides={
                    'math_output': 'MathJax https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js?config=TeX-MML-AM_CHTML,Safe',
                    'initial_header_level': 1,
                    'doctitle_xform': False,
                    'raw_enabled': False,
                    'file_insertion_enabled': False,
                    'warning_stream': warning_stream
                })
            # NOTE(review): unlike the Markdown branch, this output is marked
            # safe without being passed through bleach.
            markup['processed'] = mark_safe(force_text(parts['html_body']))
        except Exception as exc:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt/SystemExit are not swallowed. Fall back to the
            # exception text so a failure never yields an empty result with no
            # error reported.
            markup['errors'] = warning_stream.getvalue() or str(exc)
    elif language == 'Markdown':
        markup['processed'] = mark_safe(
            bleach.clean(
                apply_markdown_preserving_displayed_maths(text),
                tags=BLEACH_ALLOWED_TAGS
            )
        )
    else:
        markup['processed'] = linebreaksbr(text)
    return markup