Skip to content

Allow attr_list quoted values to contain curly braces #1414

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Mar 12, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Allow attr_list quoted values to contain curly braces
How it worked before:

  * Extract the content without allowing any `}` in it, and require that it ends with `}`. For block elements the match is anchored to the end of the line; for inline elements it is not.
  * Parse the content in more detail. No edge cases with `}` can arise. If parsing is interrupted by some unrecognized token, discard the rest of the string.

How it works now:

  * Extract the content *and allow* `}` in it, and require that it ends with `}`. For block elements the match is anchored to the end of the line; for inline elements it is not.
  * Parse the content in more detail. Allow `}` only within the quoted parts, otherwise interrupt parsing like for any other unrecognized token.
    If parsing is interrupted, some unrecognized text remains. Ideally we would perhaps bail out entirely at this point (and not recognize it as an attr_list), but to preserve historic behavior, any extra text before `}` is just discarded.
    If there is an extra `}` in the remaining text:
      * For block elements: that must mean that the attr_list syntax did not in fact terminate at the end of the line but earlier. So, bail out and do not register any attributes and do not change the original text.
      * For inline elements: that must mean that we just overmatched a bit, but that's OK, we just assign attrs as normal and put the extra text back into the string. As mentioned, any extra text *before* `}` is just discarded.
  • Loading branch information
oprypin committed Nov 11, 2023
commit d015e31bba674923d9459b793c40a6c76c8d4fd6
67 changes: 41 additions & 26 deletions markdown/extensions/attr_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,10 @@ def _handle_word(s, t):


_scanner = re.Scanner([
(r'[^ =]+=".*?"', _handle_double_quote),
(r"[^ =]+='.*?'", _handle_single_quote),
(r'[^ =]+=[^ =]+', _handle_key_value),
(r'[^ =]+', _handle_word),
(r'[^ =}]+=".*?"', _handle_double_quote),
(r"[^ =}]+='.*?'", _handle_single_quote),
(r'[^ =}]+=[^ =}]+', _handle_key_value),
(r'[^ =}]+', _handle_word),
(r' ', None)
])

Expand All @@ -76,7 +76,7 @@ def isheader(elem: Element) -> bool:

class AttrListTreeprocessor(Treeprocessor):

BASE_RE = r'\{\:?[ ]*([^\}\n ][^\}\n]*)[ ]*\}'
BASE_RE = r'\{\:?[ ]*([^\}\n ][^\n]*)[ ]*\}'
HEADER_RE = re.compile(r'[ ]+{}[ ]*$'.format(BASE_RE))
BLOCK_RE = re.compile(r'\n[ ]*{}[ ]*$'.format(BASE_RE))
INLINE_RE = re.compile(r'^{}'.format(BASE_RE))
Expand Down Expand Up @@ -106,49 +106,62 @@ def run(self, doc: Element) -> None:
# use tail of last child. no `ul` or `ol`.
m = RE.search(elem[-1].tail)
if m:
self.assign_attrs(elem, m.group(1))
elem[-1].tail = elem[-1].tail[:m.start()]
if not self.assign_attrs(elem, m.group(1), strict=True):
elem[-1].tail = elem[-1].tail[:m.start()]
elif pos is not None and pos > 0 and elem[pos-1].tail:
# use tail of last child before `ul` or `ol`
m = RE.search(elem[pos-1].tail)
if m:
self.assign_attrs(elem, m.group(1))
elem[pos-1].tail = elem[pos-1].tail[:m.start()]
if not self.assign_attrs(elem, m.group(1), strict=True):
elem[pos-1].tail = elem[pos-1].tail[:m.start()]
elif elem.text:
# use text. `ul` is first child.
m = RE.search(elem.text)
if m:
self.assign_attrs(elem, m.group(1))
elem.text = elem.text[:m.start()]
if not self.assign_attrs(elem, m.group(1), strict=True):
elem.text = elem.text[:m.start()]
elif len(elem) and elem[-1].tail:
# has children. Get from tail of last child
m = RE.search(elem[-1].tail)
if m:
self.assign_attrs(elem, m.group(1))
elem[-1].tail = elem[-1].tail[:m.start()]
if isheader(elem):
# clean up trailing #s
elem[-1].tail = elem[-1].tail.rstrip('#').rstrip()
if not self.assign_attrs(elem, m.group(1), strict=True):
elem[-1].tail = elem[-1].tail[:m.start()]
if isheader(elem):
# clean up trailing #s
elem[-1].tail = elem[-1].tail.rstrip('#').rstrip()
elif elem.text:
# no children. Get from text.
m = RE.search(elem.text)
if m:
self.assign_attrs(elem, m.group(1))
elem.text = elem.text[:m.start()]
if isheader(elem):
# clean up trailing #s
elem.text = elem.text.rstrip('#').rstrip()
if not self.assign_attrs(elem, m.group(1), strict=True):
elem.text = elem.text[:m.start()]
if isheader(elem):
# clean up trailing #s
elem.text = elem.text.rstrip('#').rstrip()
else:
# inline: check for `attrs` at start of tail
if elem.tail:
m = self.INLINE_RE.match(elem.tail)
if m:
self.assign_attrs(elem, m.group(1))
elem.tail = elem.tail[m.end():]
remainder = self.assign_attrs(elem, m.group(1))
elem.tail = elem.tail[m.end():] + remainder

def assign_attrs(self, elem: Element, attrs_string: str, *, strict: bool = False) -> str:
""" Assign `attrs` to element.

If the `attrs_string` has an extra closing curly brace, the remaining text is returned.

The `strict` argument controls whether to still assign attrs if there is a remaining `}`.
"""
attrs, remainder = _scanner.scan(attrs_string)
# To keep historic behavior, discard all un-parseable text prior to '}'.
index = remainder.find('}')
remainder = remainder[index:] if index != -1 else ''

if strict and remainder:
return remainder

def assign_attrs(self, elem: Element, attrs: str) -> None:
""" Assign `attrs` to element. """
for k, v in get_attrs(attrs):
for k, v in attrs:
if k == '.':
# add to class
cls = elem.get('class')
Expand All @@ -159,6 +172,8 @@ def assign_attrs(self, elem: Element, attrs: str) -> None:
else:
# assign attribute `k` with `v`
elem.set(self.sanitize_name(k), v)
# The text that we initially over-matched will be put back.
return remainder

def sanitize_name(self, name: str) -> str:
"""
Expand Down
7 changes: 6 additions & 1 deletion tests/extensions/attr_list.html
Original file line number Diff line number Diff line change
Expand Up @@ -66,4 +66,9 @@ <h1>Bad attributes</h1>
<p><em>More weirdness</em></p>
<p>This should not cause a <em foo="a">crash</em></p>
<p>Attr_lists do not contain <em>newlines</em>{ foo=bar
key=value }</p>
key=value }</p>
<h1 data-test="{}">Attrs</h1>
<p>attr_list values can have curly <em data-test="{hi{}" foo="bar">braces</em></p>
<h2>attr_list curly needs to be at the end {.foo} hi</h2>
<h2>attr_list curly needs to be at the end {.foo test=&rdquo;{&rdquo; } }</h2>
<p><em class="a">Multiple</em> } <em class="b">items</em> inline</p>
10 changes: 10 additions & 0 deletions tests/extensions/attr_list.txt
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,13 @@ This should not cause a *crash*{ foo=a=b }

Attr_lists do not contain *newlines*{ foo=bar
key=value }

# Attrs {data-test="{}"}

attr_list values can have curly *braces*{ data-test='{hi{}' foo="bar" }

## attr_list curly needs to be at the end {.foo} hi

## attr_list curly needs to be at the end {.foo test="{" } }

*Multiple*{.a} } *items*{.b} inline