Strings module
The strings module provides helpers for dealing with strings of text which could contain inline HTML. These are used for both source strings and translated strings.
StringValue
A fragment of HTML that only contains inline tags with all attributes stripped out.
Attributes:
Name | Type | Description |
---|---|---|
data | str | The HTML fragment. |
from_plaintext(text)
classmethod
Initialises a StringValue from a plain text string.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
text | str | The plain text to turn into a StringValue. | required |
Returns:
Type | Description |
---|---|
StringValue | The initialised StringValue. |
Source code in wagtail_localize/strings.py
@classmethod
def from_plaintext(cls, text):
    """
    Initialises a StringValue from a plain text string.

    Args:
        text (str): The plain text to turn into a StringValue.

    Returns:
        StringValue: The initialised StringValue.
    """
    # Escapes all HTML characters and replaces newlines with <br> tags
    elements = []

    for line in text.split("\n"):
        if line:
            elements.append(escape(line))

        elements.append("<br>")

    # Remove last element which is an extra br tag
    elements.pop()

    # Join the elements then pass through beautiful soup to normalize the HTML
    return cls(str(BeautifulSoup("".join(elements), "html.parser")))
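A minimal usage sketch, assuming wagtail-localize is installed; the exact serialisation of the line-break tag may vary with the installed BeautifulSoup version:

```python
from wagtail_localize.strings import StringValue

# Newlines become <br> tags and special characters are escaped
string = StringValue.from_plaintext("Fish & chips\nMushy peas")
print(string.data)  # e.g. 'Fish &amp; chips<br/>Mushy peas'
```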
from_source_html(html)
classmethod
Initialises a StringValue from an HTML string.
Source HTML is the HTML you get in Wagtail field data. This contains HTML attributes that must first be stripped out before the string can be translated.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
html | str | The HTML to turn into a StringValue. | required |
Returns:
Type | Description |
---|---|
tuple[StringValue, dict] | The initialised StringValue and a dictionary of extracted HTML attributes. |
Source code in wagtail_localize/strings.py
@classmethod
def from_source_html(cls, html):
    """
    Initialises a StringValue from an HTML string.

    Source HTML is the HTML you get in Wagtail field data. This contains HTML attributes that
    must first be stripped out before the string can be translated.

    Args:
        html (str): The HTML to turn into a StringValue.

    Returns:
        tuple[StringValue, dict]: The initialised StringValue and a dictionary of extracted HTML attributes.
    """
    # Extracts attributes from any tags (eg, href from <a> tags) and stores a version
    # with just the translatable HTML
    soup = BeautifulSoup(html, "html.parser")
    attrs = {}
    counter = Counter()

    def walk(soup):
        for element in soup.children:
            if isinstance(element, NavigableString):
                pass

            else:
                # Extract HTML attributes replacing them with an ID
                if element.attrs:
                    counter[element.name] += 1
                    element_id = element.name + str(counter[element.name])
                    attrs[element_id] = element.attrs
                    element.attrs = {"id": element_id}

                # Traverse into element children
                walk(element)

    walk(soup)

    validate_element(soup)

    return cls(str(soup)), attrs
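A minimal usage sketch showing how attributes such as href are replaced with a generated id and returned separately; the outputs in the comments are indicative:

```python
from wagtail_localize.strings import StringValue

string, attrs = StringValue.from_source_html(
    'Read <a href="https://example.com">the docs</a>'
)
print(string.data)  # 'Read <a id="a1">the docs</a>'
print(attrs)        # {'a1': {'href': 'https://example.com'}}
```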
from_translated_html(html)
classmethod
Initialises a StringValue from an HTML string.
HTML attributes are stripped out before translation, so translated HTML does not need to have them stripped out.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
html | str | The HTML to turn into a StringValue. | required |
Returns:
Type | Description |
---|---|
StringValue | The initialised StringValue. |
Source code in wagtail_localize/strings.py
@classmethod
def from_translated_html(cls, html):
    """
    Initialises a StringValue from an HTML string.

    HTML attributes are stripped out before translation, so translated HTML does not
    need to have them stripped out.

    Args:
        html (str): The HTML to turn into a StringValue.

    Returns:
        StringValue: The initialised StringValue.
    """
    soup = BeautifulSoup(html, "html.parser")

    validate_element(soup)

    return cls(str(soup))
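A minimal sketch; the translated HTML keeps the placeholder ids produced by from_source_html, and any other attribute or block-level tag would raise a ValueError:

```python
from wagtail_localize.strings import StringValue

translated = StringValue.from_translated_html('Lisez <a id="a1">la documentation</a>')
```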
get_translatable_html(self)
Returns an HTML string without restoring any HTML attributes.
Note: If the string was initialised from plain text, all special characters will be escaped.
Returns:
Type | Description |
---|---|
str | The HTML representation of the string without HTML attributes. |
Source code in wagtail_localize/strings.py
def get_translatable_html(self):
    """
    Returns an HTML string without restoring any HTML attributes.

    Note: If the string was initialised from plain text, all special characters will be escaped.

    Returns:
        str: The HTML representation of the string without HTML attributes
    """
    return self.data
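A short sketch; the value returned is the attribute-stripped form produced by from_source_html:

```python
from wagtail_localize.strings import StringValue

string, attrs = StringValue.from_source_html('<a href="https://example.com">docs</a>')
print(string.get_translatable_html())  # '<a id="a1">docs</a>'
```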
render_html(self, attrs)
Returns an HTML representation of the string.
Note: If the string was initialised from plain text, all special characters will be escaped.
Returns:
Type | Description |
---|---|
str | The HTML representation of the string. |
Source code in wagtail_localize/strings.py
def render_html(self, attrs):
    """
    Returns an HTML representation of the string.

    Note: If the string was initialised from plain text, all special characters will be escaped.

    Returns:
        str: The HTML representation of the string.
    """
    return str(self.render_soup(attrs))
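A minimal sketch; attrs is the attribute dictionary previously extracted by from_source_html (the values below are illustrative):

```python
from wagtail_localize.strings import StringValue

translated = StringValue.from_translated_html('Lisez <a id="a1">la documentation</a>')
print(translated.render_html({"a1": {"href": "https://example.com"}}))
# 'Lisez <a href="https://example.com">la documentation</a>'
```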
render_soup(self, attrs)
Returns a BeautifulSoup instance containing the string.
This is equivalent to: BeautifulSoup(string.render_html(attrs), "html.parser")
The .render_html() method calls this internally, so calling this directly is more efficient if a BeautifulSoup object is what you need.
Returns:
Type | Description |
---|---|
BeautifulSoup | A BeautifulSoup object representing the HTML of the string. |
Source code in wagtail_localize/strings.py
def render_soup(self, attrs):
    """
    Returns a BeautifulSoup instance containing the string.

    This is equivalent to: ``BeautifulSoup(string.render_html(attrs), "html.parser")``

    The .render_html() method calls this internally so it would be more performant to call this directly if a
    BeautifulSoup object is what you need.

    Returns:
        BeautifulSoup: A BeautifulSoup object representing the HTML of the string.
    """
    soup = BeautifulSoup(self.data, "html.parser")

    def walk(soup):
        for element in soup.children:
            if isinstance(element, NavigableString):
                pass

            else:
                # Restore HTML attributes
                if "id" in element.attrs:
                    element.attrs = attrs[element.attrs["id"]]

                # Traverse into element children
                walk(element)

    walk(soup)

    return soup
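A minimal sketch; working on the soup directly avoids re-parsing when the result needs further inspection:

```python
from wagtail_localize.strings import StringValue

string, attrs = StringValue.from_source_html('<a href="https://example.com">docs</a>')
soup = string.render_soup(attrs)
print(soup.find("a")["href"])  # 'https://example.com'
```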
render_text(self)
Returns a plain text representation of the string.
Note: If the string was initialised from HTML, all HTML tags will be stripped out.
Returns:
Type | Description |
---|---|
str | The plain text representation of the string. |
Source code in wagtail_localize/strings.py
def render_text(self):
    """
    Returns a plain text representation of the string.

    Note: If the string was initialised from HTML, all HTML tags will be stripped out.

    Returns:
        str: The plain text representation of the string.
    """
    soup = BeautifulSoup(self.data, "html.parser")
    texts = []

    def walk(soup):
        for element in soup.children:
            if isinstance(element, NavigableString):
                texts.append(element)
            elif element.name == "br":
                texts.append("\n")
            else:
                walk(element)

    walk(soup)

    return "".join(texts)
extract_ids(template)
Extract link ids from one template string and return them in a set.
Source code in wagtail_localize/strings.py
def extract_ids(template):
    """Extract link ids from one template string and return them in a set."""
    soup = BeautifulSoup(template, "html.parser")
    ids = set()
    for element in soup.descendants:
        if not isinstance(element, Tag):
            continue
        if element.name == "a":
            if "id" in element.attrs:
                ids.add(element.attrs["id"])
    return ids
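A minimal sketch; the ids are the placeholder ids used on <a> tags:

```python
from wagtail_localize.strings import extract_ids

print(extract_ids('See <a id="a1">this</a> and <a id="a2">that</a>'))
# {'a1', 'a2'} (set ordering may vary)
```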
extract_strings(html)
This function extracts translatable strings from an HTML fragment.
Inline elements and visible text are extracted together.
This also returns a list of hrefs that were found in the HTML; these are also included in the strings.
For example:

    <h1>Foo</h1>
    <p>
        Bar
        <ul>
            <li><b>Baz</b></li>
        </ul>
        <a href="http://example.com">A link</a>
    </p>

Will produce the following two outputs (as a 2-tuple):

    <h1><text position="0"></h1>
    <p>
        <text position="1">
        <ul>
            <li><text position="2"></li>
        </ul>
    </p>

    [
        "Foo",
        "Bar",
        "<b>Baz</b>",
        "<a href="http://example.com">A link</a>"
    ]
Parameters:
Name | Type | Description | Default |
---|---|---|---|
html | str | The HTML to extract strings from. | required |
Returns:
Type | Description |
---|---|
tuple[str, list[tuple[StringValue, dict]]] | A template string, and a list of 2-tuples containing a StringValue and a dict of HTML attributes. |
Source code in wagtail_localize/strings.py
def extract_strings(html):
    """
    This function extracts translatable strings from an HTML fragment.

    Inline elements and visible text are extracted together.

    This also returns a list of hrefs that were found in the HTML; these are also included in the strings.

    For example:

        <h1>Foo</h1>
        <p>
            Bar
            <ul>
                <li><b>Baz</b></li>
            </ul>
            <a href="http://example.com">A link</a>
        </p>

    Will produce the following two outputs (as a 2-tuple):

        <h1><text position="0"></h1>
        <p>
            <text position="1">
            <ul>
                <li><text position="2"></li>
            </ul>
        </p>

        [
            "Foo",
            "Bar",
            "<b>Baz</b>",
            "<a href="http://example.com">A link</a>"
        ]

    Args:
        html (str): The HTML to extract strings from.

    Returns:
        tuple[str, list[tuple[StringValue, dict]]]: Returns a template string, and a list of 2-tuples containing a
            StringValue and a dict of HTML attributes.
    """
    soup = BeautifulSoup(html, "html.parser")

    def wrap(elements):
        """
        Wraps the given elements with a <text> tag

        The elements must be contiguous siblings or this might screw up the tree.
        """
        elements = list(elements)

        # Skip if there are no tags to wrap
        # We can get here after filters below have been applied
        if len(elements) == 0:
            return

        # If there is a single element and that is an inline tag, wrap just the contents.
        # We only care about inline tags that wrap only part of a segment
        if (
            len(elements) == 1
            and not isinstance(elements[0], NavigableString)
            and elements[0].name != "a"  # keep href translatable
            and elements[0].name in INLINE_TAGS
        ):
            wrap(elements[0].children)
            return

        def ignore_if_at_end(element):
            """
            Returns True if the given element should be ignored if it is at one of the ends
            """
            if isinstance(element, NavigableString):
                return False

            # Ignore if there are no text nodes
            # This will exclude both <br> tags and empty inline tags
            if not any(
                isinstance(desc, NavigableString) for desc in element.descendants
            ):
                return True

            return False

        if ignore_if_at_end(elements[0]):
            wrap(elements[1:])
            return

        if ignore_if_at_end(elements[-1]):
            wrap(elements[:-1])
            return

        value = "".join(
            element.output_ready()
            if isinstance(element, NavigableString)
            else str(element)
            for element in elements
        )

        if value and not value.isspace():
            # Create <text> tag
            elements[0].insert_before(soup.new_tag("text", value=value))

            # Remove elements
            for element in elements:
                element.replaceWith("")

    def walk(element):
        """
        Walks the tree in depth first search post-order.

        When it encounters an element that could be extracted, it wraps it with
        a <text> tag. These are extracted in the next stage (because we want to
        preserve order of occurrence).

        For example:

            <p>
                Foo
                <ul>
                    <li>Bar</li>
                </ul>
                Baz
            </p>

        Is transformed to:

            <p>
                <text>Foo</text>
                <ul>
                    <li><text><b>Bar</b></text></li>
                </ul>
                <text>Baz</text>
            </p>
        """
        if isinstance(element, NavigableString):
            return False, False

        has_block = False
        has_wrap = False
        buffer = []

        for child in element.children:
            child_has_wrap, is_block = walk(child)

            if child_has_wrap:
                has_wrap = True

            if is_block:
                has_block = True

                if buffer:
                    wrap(buffer)
                    buffer = []
                    has_wrap = True

            else:
                if not child_has_wrap:
                    buffer.append(child)

        if buffer and has_block:
            wrap(buffer)
            buffer = []
            has_wrap = True

        if element.name not in INLINE_TAGS:
            if buffer:
                wrap(buffer)
                has_wrap = True

            return has_wrap, True

        return has_wrap, False

    walk(soup)

    # Now extract strings from the <text> tags
    strings = []
    position = 0
    for element in soup.descendants:
        if element.name == "text":
            text = element.attrs.pop("value")

            # Strip leading and trailing whitespace. We keep the values and reinsert them
            # into the template
            # This is probably not necessary, but just to be on the safe side
            text, prefix = lstrip_keep(text)
            text, suffix = rstrip_keep(text)

            element.attrs["position"] = position
            position += 1

            string_val, attrs = StringValue.from_source_html(text)
            strings.append((string_val, attrs))

            if prefix:
                element.insert_before(prefix)

            if suffix:
                element.insert_after(suffix)

    return str(soup), strings
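A minimal sketch based on the docstring example; the exact whitespace in the returned template may differ:

```python
from wagtail_localize.strings import extract_strings

template, strings = extract_strings(
    '<h1>Foo</h1><p>Bar <a href="http://example.com">A link</a></p>'
)
# template is roughly:
#   '<h1><text position="0"></text></h1><p><text position="1"></text></p>'
# strings holds (StringValue, attrs) pairs; their data and attrs are roughly:
#   'Foo'                        with {}
#   'Bar <a id="a1">A link</a>'  with {'a1': {'href': 'http://example.com'}}
```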
lstrip_keep(text)
Like lstrip, but also returns the whitespace that was stripped off
Source code in wagtail_localize/strings.py
def lstrip_keep(text):
    """
    Like lstrip, but also returns the whitespace that was stripped off
    """
    text_length = len(text)
    new_text = text.lstrip()
    prefix = text[0 : (text_length - len(new_text))]
    return new_text, prefix
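A short example of the return value:

```python
from wagtail_localize.strings import lstrip_keep

print(lstrip_keep("  hello "))  # ('hello ', '  ')
```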
restore_strings(template, strings)
Inserts a list of strings into the template.
This reverses the extract_strings function.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
template | str | The HTML template. | required |
strings | list[tuple[StringValue, dict]] | A list of 2-tuples containing a StringValue and HTML attributes dict for each string to reinsert into the template. | required |
Returns:
Type | Description |
---|---|
str | An HTML blob with the strings inserted into the template. |
Source code in wagtail_localize/strings.py
def restore_strings(template, strings):
    """
    Inserts a list of strings into the template.

    This reverses the `extract_strings` function.

    Args:
        template (str): The HTML template.
        strings (list[tuple[StringValue, dict]]): A list of 2-tuples containing a StringValue and HTML attributes dict
            for each string to reinsert into the template.

    Returns:
        str: An HTML blob with the strings inserted into the template.
    """
    soup = BeautifulSoup(template, "html.parser")
    for text_element in soup.findAll("text"):
        string, attrs = strings[int(text_element.get("position"))]
        text_element.replaceWith(string.render_soup(attrs))
    return str(soup)
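A minimal sketch, assuming the strings have been translated while keeping the same positions and extracted attributes:

```python
from wagtail_localize.strings import StringValue, extract_strings, restore_strings

template, strings = extract_strings("<p>Hello <b>world</b></p>")

# Replace each extracted StringValue with its translation, keeping the extracted attrs
translated = [
    (StringValue.from_translated_html("Bonjour <b>le monde</b>"), attrs)
    for _, attrs in strings
]
print(restore_strings(template, translated))  # '<p>Bonjour <b>le monde</b></p>'
```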
rstrip_keep(text)
Like rstrip, but also returns the whitespace that was stripped off
Source code in wagtail_localize/strings.py
def rstrip_keep(text):
    """
    Like rstrip, but also returns the whitespace that was stripped off
    """
    text_length = len(text)
    new_text = text.rstrip()
    if text_length != len(new_text):
        suffix = text[-(text_length - len(new_text)) :]
    else:
        suffix = ""
    return new_text, suffix
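A short example of the return value:

```python
from wagtail_localize.strings import rstrip_keep

print(rstrip_keep(" hello  "))  # (' hello', '  ')
```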
validate_element(element)
Checks the given BeautifulSoup element for anything that we disallow from strings.
Source code in wagtail_localize/strings.py
def validate_element(element):
    """
    Checks the given BeautifulSoup element for anything that we disallow from strings.
    """
    if isinstance(element, NavigableString):
        return

    # Validate tag and attributes
    if isinstance(element, Tag) and element.name != "[document]":
        # Block tags are not allowed in strings
        if element.name not in INLINE_TAGS:
            raise ValueError(
                _(
                    "<{}> tag is not allowed. Strings can only contain standard HTML inline tags (such as <b>, <a>)"
                ).format(element.name)
            )

        # Elements can't have attributes, except for <a> tags
        keys = set(element.attrs.keys())
        if element.name == "a" and "id" in keys:
            keys.remove("id")

        if keys:
            raise ValueError(
                _(
                    "Strings cannot have any HTML tags with attributes (except for 'id' in <a> tags)"
                )
            )

    # Traverse children
    for child_element in element.children:
        validate_element(child_element)
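A minimal sketch, assuming a project with wagtail-localize installed; block-level tags and disallowed attributes raise a ValueError:

```python
from bs4 import BeautifulSoup
from wagtail_localize.strings import validate_element

# Inline tags without attributes pass silently
validate_element(BeautifulSoup("<b>Fine</b>", "html.parser"))

# Block tags are rejected
try:
    validate_element(BeautifulSoup("<p>Not allowed</p>", "html.parser"))
except ValueError as error:
    print(error)
```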
validate_translation_links(translation_of, data)
Check that the link ids in a translation are present in its source.
Source code in wagtail_localize/strings.py
def validate_translation_links(translation_of, data):
    """Check that the link ids in a translation are present in its source."""
    id1s, id2s = extract_ids(translation_of), extract_ids(data)
    new_ids = id2s - id1s

    if new_ids:
        ids = ", ".join(sorted(new_ids))
        raise ValueError(_("Unrecognised id found in an <a> tag: {}").format(ids))