Allow attr_list quoted values to contain curly braces

How it worked before:

* Extract the content without allowing any `}` in it, and require that it ends with `}` - for block elements it's anchored to the end of the line, otherwise not.
* Parse the content in more detail. No edge cases with `}` can arise. If parsing is interrupted by some unrecognized token, discard the rest of the string.

How it works now:

* Extract the content *and allow* `}` in it, and require that it ends with `}` - for block elements it's anchored to the end of the line, otherwise not.
* Parse the content in more detail. Allow `}` only within the quoted parts, otherwise interrupt parsing like for any other unrecognized token.

If parsing is interrupted, there is remaining unrecognized text. Ideally perhaps we would bail out at this point entirely (and not recognize it as an attr_list), but to preserve historic behavior, any extra text before `}` is just discarded.

If there is an extra `}` in the remaining text:

* For block elements: that must mean that the attr_list syntax did not in fact terminate at the end of the line but earlier. So, bail out and do not register any attributes and do not change the original text.
* For inline elements: that must mean that we just overmatched a bit, but that's OK, we just assign attrs as normal and put the extra text back into the string. As mentioned, any extra text *before* `}` is just discarded.
Python-Markdown · waylan · Mar 12, 2024 · Nov 10, 2023 · Mar 8, 2024 · Mar 8, 2024
commit d015e31bba674923d9459b793c40a6c76c8d4fd6
diff --git a/markdown/extensions/attr_list.py b/markdown/extensions/attr_list.py
@@ -57,10 +57,10 @@ def _handle_word(s, t):
 
 
 _scanner = re.Scanner([
-    (r'[^ =]+=".*?"', _handle_double_quote),
-    (r"[^ =]+='.*?'", _handle_single_quote),
-    (r'[^ =]+=[^ =]+', _handle_key_value),
-    (r'[^ =]+', _handle_word),
+    (r'[^ =}]+=".*?"', _handle_double_quote),
+    (r"[^ =}]+='.*?'", _handle_single_quote),
+    (r'[^ =}]+=[^ =}]+', _handle_key_value),
+    (r'[^ =}]+', _handle_word),
     (r' ', None)
 ])
 
@@ -76,7 +76,7 @@ def isheader(elem: Element) -> bool:
 
 class AttrListTreeprocessor(Treeprocessor):
 
-    BASE_RE = r'\{\:?[ ]*([^\}\n ][^\}\n]*)[ ]*\}'
+    BASE_RE = r'\{\:?[ ]*([^\}\n ][^\n]*)[ ]*\}'
     HEADER_RE = re.compile(r'[ ]+{}[ ]*$'.format(BASE_RE))
     BLOCK_RE = re.compile(r'\n[ ]*{}[ ]*$'.format(BASE_RE))
     INLINE_RE = re.compile(r'^{}'.format(BASE_RE))
@@ -106,49 +106,62 @@ def run(self, doc: Element) -> None:
                         # use tail of last child. no `ul` or `ol`.
                         m = RE.search(elem[-1].tail)
                         if m:
-                            self.assign_attrs(elem, m.group(1))
-                            elem[-1].tail = elem[-1].tail[:m.start()]
+                            if not self.assign_attrs(elem, m.group(1), strict=True):
+                                elem[-1].tail = elem[-1].tail[:m.start()]
                     elif pos is not None and pos > 0 and elem[pos-1].tail:
                         # use tail of last child before `ul` or `ol`
                         m = RE.search(elem[pos-1].tail)
                         if m:
-                            self.assign_attrs(elem, m.group(1))
-                            elem[pos-1].tail = elem[pos-1].tail[:m.start()]
+                            if not self.assign_attrs(elem, m.group(1), strict=True):
+                                elem[pos-1].tail = elem[pos-1].tail[:m.start()]
                     elif elem.text:
                         # use text. `ul` is first child.
                         m = RE.search(elem.text)
                         if m:
-                            self.assign_attrs(elem, m.group(1))
-                            elem.text = elem.text[:m.start()]
+                            if not self.assign_attrs(elem, m.group(1), strict=True):
+                                elem.text = elem.text[:m.start()]
                 elif len(elem) and elem[-1].tail:
                     # has children. Get from tail of last child
                     m = RE.search(elem[-1].tail)
                     if m:
-                        self.assign_attrs(elem, m.group(1))
-                        elem[-1].tail = elem[-1].tail[:m.start()]
-                        if isheader(elem):
-                            # clean up trailing #s
-                            elem[-1].tail = elem[-1].tail.rstrip('#').rstrip()
+                        if not self.assign_attrs(elem, m.group(1), strict=True):
+                            elem[-1].tail = elem[-1].tail[:m.start()]
+                            if isheader(elem):
+                                # clean up trailing #s
+                                elem[-1].tail = elem[-1].tail.rstrip('#').rstrip()
                 elif elem.text:
                     # no children. Get from text.
                     m = RE.search(elem.text)
                     if m:
-                        self.assign_attrs(elem, m.group(1))
-                        elem.text = elem.text[:m.start()]
-                        if isheader(elem):
-                            # clean up trailing #s
-                            elem.text = elem.text.rstrip('#').rstrip()
+                        if not self.assign_attrs(elem, m.group(1), strict=True):
+                            elem.text = elem.text[:m.start()]
+                            if isheader(elem):
+                                # clean up trailing #s
+                                elem.text = elem.text.rstrip('#').rstrip()
             else:
                 # inline: check for `attrs` at start of tail
                 if elem.tail:
                     m = self.INLINE_RE.match(elem.tail)
                     if m:
-                        self.assign_attrs(elem, m.group(1))
-                        elem.tail = elem.tail[m.end():]
+                        remainder = self.assign_attrs(elem, m.group(1))
+                        elem.tail = elem.tail[m.end():] + remainder
+
+    def assign_attrs(self, elem: Element, attrs_string: str, *, strict: bool = False) -> str:
+        """ Assign `attrs` to element.
+
+        If the `attrs_string` has an extra closing curly brace, the remaining text is returned.
+
+        The `strict` argument controls whether to still assign attrs if there is a remaining `}`.
+        """
+        attrs, remainder = _scanner.scan(attrs_string)
+        # To keep historic behavior, discard all un-parseable text prior to '}'.
+        index = remainder.find('}')
+        remainder = remainder[index:] if index != -1 else ''
+
+        if strict and remainder:
+            return remainder
 
-    def assign_attrs(self, elem: Element, attrs: str) -> None:
-        """ Assign `attrs` to element. """
-        for k, v in get_attrs(attrs):
+        for k, v in attrs:
             if k == '.':
                 # add to class
                 cls = elem.get('class')
@@ -159,6 +172,8 @@ def assign_attrs(self, elem: Element, attrs: str) -> None:
             else:
                 # assign attribute `k` with `v`
                 elem.set(self.sanitize_name(k), v)
+        # The text that we initially over-matched will be put back.
+        return remainder
 
     def sanitize_name(self, name: str) -> str:
         """

diff --git a/tests/extensions/attr_list.html b/tests/extensions/attr_list.html
@@ -66,4 +66,9 @@ <h1>Bad attributes</h1>
 <p><em>More weirdness</em></p>
 <p>This should not cause a <em foo="a">crash</em></p>
 <p>Attr_lists do not contain <em>newlines</em>{ foo=bar
-key=value }</p>
+key=value }</p>
+<h1 data-test="{}">Attrs</h1>
+<p>attr_list values can have curly <em data-test="{hi{}" foo="bar">braces</em></p>
+<h2>attr_list curly needs to be at the end {.foo} hi</h2>
+<h2>attr_list curly needs to be at the end {.foo test=&rdquo;{&rdquo; } }</h2>
+<p><em class="a">Multiple</em> } <em class="b">items</em> inline</p>
diff --git a/tests/extensions/attr_list.txt b/tests/extensions/attr_list.txt
@@ -92,3 +92,13 @@ This should not cause a *crash*{ foo=a=b }
 
 Attr_lists do not contain *newlines*{ foo=bar
 key=value }
+
+# Attrs {data-test="{}"}
+
+attr_list values can have curly *braces*{ data-test='{hi{}' foo="bar" }
+
+## attr_list curly needs to be at the end {.foo} hi
+
+## attr_list curly needs to be at the end {.foo test="{" } }
+
+*Multiple*{.a} } *items*{.b} inline