| Current Path : /var/www/surf/TYPO3/vendor/typo3/cms-core/Classes/Html/ |
| Current File : /var/www/surf/TYPO3/vendor/typo3/cms-core/Classes/Html/RteHtmlParser.php |
<?php
declare(strict_types=1);
/*
* This file is part of the TYPO3 CMS project.
*
* It is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License, either version 2
* of the License, or any later version.
*
* For the full copyright and license information, please read the
* LICENSE.txt file that was distributed with this source code.
*
* The TYPO3 project - inspiring people to share!
*/
namespace TYPO3\CMS\Core\Html;
use Psr\EventDispatcher\EventDispatcherInterface;
use Psr\Log\LoggerAwareInterface;
use Psr\Log\LoggerAwareTrait;
use TYPO3\CMS\Core\Configuration\Features;
use TYPO3\CMS\Core\Html\Event\BrokenLinkAnalysisEvent;
use TYPO3\CMS\Core\LinkHandling\Exception\UnknownLinkHandlerException;
use TYPO3\CMS\Core\LinkHandling\LinkService;
use TYPO3\CMS\Core\Resource\Exception\InsufficientFolderAccessPermissionsException;
use TYPO3\CMS\Core\Utility\GeneralUtility;
use TYPO3\HtmlSanitizer\Builder\BuilderInterface;
/**
* Class for parsing HTML for the Rich Text Editor. (also called transformations)
*
* Concerning line breaks:
* Regardless if LF (Unix-style) or CRLF (Windows) was put in, the HtmlParser works with LFs and migrates all
* line breaks to LFs internally, however when all transformations are done, all LFs are transformed to CRLFs.
* This means: RteHtmlParser always returns CRLFs to be maximum compatible with all formats.
*/
class RteHtmlParser extends HtmlParser implements LoggerAwareInterface
{
use LoggerAwareTrait;
/**
 * Comma-separated list of block-level tag names. The css_transform handlers split the content
 * on these elements, and such elements are not wrapped into a "p" tag during the transformation.
 * The default list is written in upper case; it is replaced by the processing option
 * "blockElementList" when that option is not empty.
 */
protected string $blockElementList = 'DIV,TABLE,BLOCKQUOTE,PRE,UL,OL,H1,H2,H3,H4,H5,H6,ADDRESS,DL,DD,HEADER,SECTION,FOOTER,NAV,ARTICLE,ASIDE,FIGURE';
/**
 * Comma-separated list of all tags that are allowed by default.
 * getKeepTags() extends this list with the "allowTags" processing option and then
 * removes everything listed in "denyTags".
 */
protected string $defaultAllowedTagsList = 'b,i,u,a,img,br,div,center,pre,figure,figcaption,font,hr,sub,sup,p,strong,em,li,ul,ol,blockquote,strike,span,abbr,acronym,dfn';
/**
 * The processing options ("proc.") of the current transformation call,
 * assigned in setProcessingConfiguration().
 */
protected array $procOptions = [];
/**
 * Run-away brake for recursive calls of TS_transform_db():
 * decremented when entering that method and incremented again when leaving it.
 */
protected int $TS_transform_db_safecounter = 100;
/**
 * Per-direction ("db" / "rte") cache of the keep-tags configuration built by getKeepTags().
 * Emptied whenever a new processing configuration is set.
 */
protected array $getKeepTags_cache = [];
/**
 * Storage of the allowed CSS class names in the RTE,
 * taken from the processing option "allowedClasses." (array) or "allowedClasses" (comma list).
 */
protected array $allowedClasses = [];
/**
 * A list of HTML attributes that are kept on <p> tags. Because <p> tags are currently wrapped in a
 * special handling, they have their own place for configuration: the list is replaced by the
 * processing option "allowAttributes." when that option is set.
 */
protected array $allowedAttributesForParagraphTags = [
'class',
'align',
'id',
'title',
'dir',
'lang',
'xml:lang',
'itemscope',
'itemtype',
'itemprop',
];
/**
 * Any tags that are allowed outside of <p> sections - usually similar to the block elements
 * plus some special tags like <hr>.
 * Completely overridable via the processing option "allowTagsOutside" (comma list), or
 * "allowTagsOutside." (array) when both are set.
 */
protected array $allowedTagsOutsideOfParagraphs = [
'address',
'article',
'aside',
'blockquote',
'div',
'footer',
'figure',
'figcaption',
'header',
'hr',
'nav',
'section',
];
/**
 * @param EventDispatcherInterface $eventDispatcher Used to dispatch the BrokenLinkAnalysisEvent in markBrokenLinks()
 */
public function __construct(
    protected readonly EventDispatcherInterface $eventDispatcher,
) {
}
/**
 * Stores the given processing options (usually the "proc." part of a RichTextConfiguration result)
 * and derives the dependent properties from them.
 *
 * Only properties whose option is present are overwritten; the keep-tags cache is always emptied.
 *
 * @param array $processingConfiguration The processing options to apply
 */
protected function setProcessingConfiguration(array $processingConfiguration): void
{
    $this->procOptions = $processingConfiguration;
    // Cached keep-tags were built from the previous options
    $this->getKeepTags_cache = [];
    $options = $this->procOptions;

    // Allowed CSS classes: the array notation wins over the comma-separated list
    $this->allowedClasses = isset($options['allowedClasses.'])
        ? (array)$options['allowedClasses.']
        : GeneralUtility::trimExplode(',', $options['allowedClasses'] ?? '', true);

    // Dynamic configuration of blockElementList
    if (!empty($options['blockElementList'])) {
        $this->blockElementList = $options['blockElementList'];
    }

    // Attributes which are allowed on <p> tags
    if (isset($options['allowAttributes.'])) {
        $this->allowedAttributesForParagraphTags = $options['allowAttributes.'];
    }

    // Tags which are allowed outside of <p> tags are only overridden if "allowTagsOutside" is set
    if (!isset($options['allowTagsOutside'])) {
        return;
    }
    $this->allowedTagsOutsideOfParagraphs = isset($options['allowTagsOutside.'])
        ? (array)$options['allowTagsOutside.']
        : GeneralUtility::trimExplode(',', strtolower($options['allowTagsOutside']), true);
}
/**
 * Main entry point for transforming content coming from the database so that the
 * Rich Text Editor can deal with it (e.g. links).
 *
 * Order of processing: line break normalisation, optional "entryHTMLparser_rte",
 * the resolved transformation modes, optional "exitHTMLparser_rte", conversion to CRLF.
 *
 * @param string $value Content as stored in the database
 * @param array $processingConfiguration The processing options ("proc.")
 * @return string Content for the editor, with CRLF line breaks
 */
public function transformTextForRichTextEditor(string $value, array $processingConfiguration): string
{
    $this->setProcessingConfiguration($processingConfiguration);
    $transformations = $this->resolveAppliedTransformationModes('rte');

    $content = $this->streamlineLineBreaksForProcessing($value);
    $content = $this->runHtmlParserIfConfigured($content, 'entryHTMLparser_rte');

    foreach ($transformations as $transformation) {
        $content = match ($transformation) {
            'detectbrokenlinks' => $this->markBrokenLinks($content),
            'css_transform' => $this->TS_transform_rte($content),
            // Any other mode has no handler in this direction
            default => $content,
        };
    }

    $content = $this->runHtmlParserIfConfigured($content, 'exitHTMLparser_rte');

    return $this->streamlineLineBreaksAfterProcessing($content);
}
/**
 * Called to process HTML content before it is stored in the database.
 *
 * Order of processing: line break normalisation, optional "entryHTMLparser_db", the resolved
 * transformation modes, HTML sanitizer, optional "exitHTMLparser_db", conversion to CRLF.
 *
 * @param string $value Content as delivered by the Rich Text Editor
 * @param array $processingConfiguration The processing options ("proc.")
 * @return string Content to persist, with CRLF line breaks
 */
public function transformTextForPersistence(string $value, array $processingConfiguration): string
{
    $this->setProcessingConfiguration($processingConfiguration);
    $modes = $this->resolveAppliedTransformationModes('db');
    $value = $this->streamlineLineBreaksForProcessing($value);
    // If an entry HTML cleaner was configured, pass the content through the HTMLcleaner
    $value = $this->runHtmlParserIfConfigured($value, 'entryHTMLparser_db');
    // Traverse modes
    foreach ($modes as $cmd) {
        switch ($cmd) {
            case 'detectbrokenlinks':
                $value = $this->removeBrokenLinkMarkers($value);
                break;
            case 'ts_links':
                $value = $this->TS_links_db($value);
                break;
            case 'css_transform':
                // Transform empty paragraphs into spacing paragraphs. The spacing paragraph has to carry
                // the "&nbsp;" entity: a paragraph holding only whitespace would not be recognised as a
                // spacing paragraph by the following steps.
                $value = str_replace('<p></p>', '<p>&nbsp;</p>', $value);
                // Double any trailing spacing paragraph so that it does not get removed by divideIntoLines()
                $value = preg_replace('/<p>&nbsp;<\/p>$/', '<p>&nbsp;</p><p>&nbsp;</p>', $value) ?? $value;
                $value = $this->TS_transform_db($value);
                break;
            default:
                // Do nothing
        }
    }
    // Process markup with HTML Sanitizer
    $value = $this->htmlSanitize($value, $this->procOptions['HTMLparser_db.'] ?? []);
    // If an exit HTML cleaner was configured, pass the content through the HTMLcleaner
    $value = $this->runHtmlParserIfConfigured($value, 'exitHTMLparser_db');
    // Final clean up of linebreaks
    return $this->streamlineLineBreaksAfterProcessing($value);
}
/**
 * Resolves which transformation modes should be executed, and ensures each is executed only once.
 *
 * The option "overruleMode" (comma list) takes precedence over "mode". The shortcut "default" is
 * expanded to "detectbrokenlinks,css_transform,ts_links". A missing "mode" option results in an
 * empty list instead of an undefined-key warning.
 *
 * @param string $direction Either "db" or "rte"; for "rte" the order of the modes is reversed
 * @return array the resolved transformation modes
 */
protected function resolveAppliedTransformationModes(string $direction): array
{
    // Setting modes / transformations to be called
    $overruleMode = (string)($this->procOptions['overruleMode'] ?? '');
    if ($overruleMode !== '') {
        $modes = GeneralUtility::trimExplode(',', $overruleMode);
    } else {
        $modes = [(string)($this->procOptions['mode'] ?? '')];
    }
    $modeList = implode(',', $modes);
    // Replace the shortcut "default" with all custom modes
    $modeList = str_replace('default', 'detectbrokenlinks,css_transform,ts_links', $modeList);
    // Make list unique (empty entries are dropped)
    $modes = array_unique(GeneralUtility::trimExplode(',', $modeList, true));
    // Reverse order if direction is "rte"
    if ($direction === 'rte') {
        $modes = array_reverse($modes);
    }
    return $modes;
}
/**
 * Runs the HTML parser if it is configured.
 *
 * These additional HTML cleaner configurations are applied either before or after the main
 * transformation is done and are thus totally independent processing options you can set up.
 *
 * This is only possible via TSconfig (procOptions) currently.
 *
 * @param string $content the content to process
 * @param string $configurationDirective used to look up in the procOptions if the parser is enabled,
 *                                       and then to fetch its configuration from the key "<directive>."
 * @return string the processed content
 */
protected function runHtmlParserIfConfigured(string $content, string $configurationDirective): string
{
    if (empty($this->procOptions[$configurationDirective])) {
        return $content;
    }
    // The directive may be enabled without any further configuration being given
    $parserConfiguration = $this->procOptions[$configurationDirective . '.'] ?? [];
    [$keepTags, $keepNonMatchedTags, $hscMode, $additionalConfiguration] = $this->HTMLparserConfig($parserConfiguration);
    return $this->HTMLcleaner($content, $keepTags, $keepNonMatchedTags, $hscMode, $additionalConfiguration);
}
/************************************
*
* Specific RTE TRANSFORMATION functions
*
*************************************/
/**
 * Transformation handler: 'ts_links' / direction: "db"
 *
 * Splits the content into <a> tag blocks and rewrites the href of every link with the string
 * representation returned by the LinkService. Nested <a> blocks are processed recursively.
 * Anchors without an href attribute are left untouched.
 *
 * @param string $value Content input
 * @return string Content output
 */
protected function TS_links_db(string $value): string
{
    $blockSplit = $this->splitIntoBlock('A', $value);
    // Fetched on first use only, and then shared by all links of this call
    $linkService = null;
    foreach ($blockSplit as $k => $v) {
        // Even positions hold the content between the <a> blocks
        if ($k % 2 === 0) {
            continue;
        }
        [$tagAttributes] = $this->get_tag_attributes($this->getFirstTag($v), true);
        // Anchors would not have an href attribute
        if (!isset($tagAttributes['href'])) {
            continue;
        }
        $linkService ??= GeneralUtility::makeInstance(LinkService::class);
        // Reset for every link: if resolve() throws, the fallback below must not pick up
        // the link information of a previously processed link.
        $linkInformation = [];
        // Store the link as <a> tag as default by TYPO3, with the link service syntax
        try {
            $linkInformation = $linkService->resolve($tagAttributes['href']);
            $tagAttributes['href'] = $linkService->asString($linkInformation);
        } catch (UnknownLinkHandlerException) {
            $tagAttributes['href'] = $linkInformation['href'] ?? $tagAttributes['href'];
        }
        $blockSplit[$k] = '<a ' . GeneralUtility::implodeAttributes($tagAttributes, true) . '>'
            . $this->TS_links_db($this->removeFirstAndLastTag($blockSplit[$k])) . '</a>';
    }
    return implode('', $blockSplit);
}
/**
 * Transformation handler: 'css_transform' / direction: "db"
 * Cleaning (->db) for standard content elements (ts)
 *
 * Splits the content by the configured block elements. Container-like blocks are processed
 * recursively, <pre> is kept as is, and line breaks inside other blocks are replaced by spaces.
 * Content between blocks is cleaned by the HTMLcleaner and divided into lines.
 *
 * @param string $value Content input
 * @return string Content output
 * @see TS_transform_rte()
 */
protected function TS_transform_db(string $value): string
{
    // Safety... so forever loops are avoided (they should not occur, but an error would potentially do this...)
    // The brake is checked before the counter is touched, so that bailing out leaves the counter balanced.
    if ($this->TS_transform_db_safecounter <= 0) {
        return $value;
    }
    $this->TS_transform_db_safecounter--;
    // Split the content from RTE by the occurrence of these blocks:
    $blockSplit = $this->splitIntoBlock($this->blockElementList, $value);
    // Avoid superfluous linebreaks by transform_db after ending headListTag
    while (count($blockSplit) > 0 && trim(end($blockSplit)) === '') {
        array_pop($blockSplit);
    }
    // Traverse the blocks
    foreach ($blockSplit as $k => $v) {
        if ($k % 2) {
            // Inside block:
            $tag = $this->getFirstTag($v);
            $tagName = strtolower($this->getFirstTagName($v));
            // Process based on the tag:
            switch ($tagName) {
                case 'blockquote':
                case 'dd':
                case 'div':
                case 'header':
                case 'section':
                case 'footer':
                case 'nav':
                case 'article':
                case 'aside':
                    $blockSplit[$k] = $tag . $this->TS_transform_db($this->removeFirstAndLastTag($blockSplit[$k])) . '</' . $tagName . '>';
                    break;
                case 'pre':
                    break;
                default:
                    // usually <hx> tags and <table> tags where no other block elements are within the tags
                    // Eliminate true linebreaks inside block element tags
                    $blockSplit[$k] = preg_replace('/[' . LF . ']+/', ' ', $blockSplit[$k]) ?? $blockSplit[$k];
            }
        } elseif (trim($blockSplit[$k]) !== '') {
            // NON-block:
            // Every replacement falls back to its input if the regular expression engine reports an
            // error, so that a failure neither drops the content nor passes null to the next call.
            $string = $blockSplit[$k];
            $string = preg_replace('#<([a-z]+)/>#', '<$1 />', $string) ?? $string;
            // Remove linebreaks preceding hr tags
            $string = preg_replace('/[' . LF . ']+<(hr)(\\s[^>\\/]*)?[[:space:]]*\\/?>/', '<$1$2/>', $string) ?? $string;
            // Remove linebreaks following hr tags
            $string = preg_replace('/<(hr)(\\s[^>\\/]*)?[[:space:]]*\\/?>[' . LF . ']+/', '<$1$2/>', $string) ?? $string;
            // Replace other linebreaks with space
            $string = preg_replace('/[' . LF . ']+/', ' ', $string) ?? $string;
            // process allowed/removed tags
            $string = $this->HTMLcleaner(
                $string,
                $this->getKeepTags('db'),
                $this->procOptions['HTMLparser_db.']['keepNonMatchedTags'] ?? '',
                (int)($this->procOptions['HTMLparser_db.']['htmlSpecialChars'] ?? 0)
            );
            $blockSplit[$k] = (string)$this->divideIntoLines($string);
        } else {
            // Blank content between blocks is dropped
            unset($blockSplit[$k]);
        }
    }
    $this->TS_transform_db_safecounter++;
    return implode(LF, $blockSplit);
}
/**
 * Transformation handler: css_transform / direction: "rte"
 * Set (->rte) for standard content elements (ts)
 *
 * Splits the database content by the configured block elements. Container-like blocks are
 * processed recursively; the content between blocks is turned into paragraphs via setDivTags().
 *
 * @param string $value Content input
 * @return string Content output
 * @see TS_transform_db()
 */
protected function TS_transform_rte(string $value): string
{
    // Block elements whose inner content is transformed recursively
    $recursiveContainers = ['blockquote', 'dd', 'div', 'header', 'section', 'footer', 'nav', 'article', 'aside'];
    $segments = $this->splitIntoBlock($this->blockElementList, $value);

    foreach ($segments as $index => $segment) {
        $next = $index + 1;

        if ($index % 2) {
            // Odd positions: one of the block elements
            $openingTag = $this->getFirstTag($segment);
            $name = strtolower($this->getFirstTagName($segment));
            if (in_array($name, $recursiveContainers, true)) {
                $inner = $this->removeFirstAndLastTag($segments[$index]);
                $segments[$index] = $openingTag . $this->TS_transform_rte($inner) . '</' . $name . '>';
            }
            // Strip the first linebreak (with leading blanks) of whatever follows the block
            if (!isset($segments[$next])) {
                $segments[$next] = '';
            }
            $segments[$next] = preg_replace('/^[ ]*' . LF . '/', '', $segments[$next]);
            continue;
        }

        // Even positions: content outside of block elements
        $followingTagName = $this->getFirstTagName($segments[$next] ?? '');
        $onlyLineBreaks = preg_match('/^[ ]*' . LF . '+[ ]*$/', $segments[$index]) === 1;

        // If the line is followed by a block or is the last line:
        if (GeneralUtility::inList($this->blockElementList, $followingTagName) || !isset($segments[$next])) {
            $segments[$index] = $onlyLineBreaks
                // The line contains only linebreaks: remove the leading linebreak
                ? preg_replace('/^[ ]*' . LF . '/', '', $segments[$index])
                // Otherwise reduce the number of trailing linebreaks by 1
                : preg_replace('/(' . LF . '*)' . LF . '[ ]*$/', '$1', $segments[$index]);
        }

        // A blank line is dropped, unless it only contained linebreaks
        if (!$onlyLineBreaks && (string)$segments[$index] === '') {
            unset($segments[$index]);
        } else {
            $segments[$index] = $this->setDivTags($segments[$index]);
        }
    }

    return implode(LF, $segments);
}
/***************************************************************
*
* Generic RTE transformation, analysis and helper functions
*
**************************************************************/
/**
 * Cleans content that is going into the database.
 *
 * Calls HTMLcleaner() with the keep-tags configuration for the direction "db"
 * and with the "keep non matched tags" argument set to false.
 *
 * @param string $content Content to clean up
 * @return string Clean content
 * @see getKeepTags()
 */
protected function HTMLcleaner_db(string $content): string
{
    return $this->HTMLcleaner($content, $this->getKeepTags('db'), false);
}
/**
 * Builds (and caches per direction) the keep-tags configuration for the HTMLcleaner function,
 * depending on whether content goes TO or FROM the Rich Text Editor.
 *
 * @param string $direction "db" (content going into the database FROM the rte) or "rte" (content going into the form)
 * @return array Configuration array
 * @see HTMLcleaner_db()
 */
protected function getKeepTags(string $direction): array
{
    $cached = $this->getKeepTags_cache[$direction] ?? null;
    if (is_array($cached)) {
        return $cached;
    }

    // Additional allowed tags: the array notation wins over the comma-separated list
    $additionalTags = is_array($this->procOptions['allowTags.'] ?? null)
        ? implode(',', $this->procOptions['allowTags.'])
        : ($this->procOptions['allowTags'] ?? '');
    $keepTags = array_flip(
        GeneralUtility::trimExplode(',', $this->defaultAllowedTagsList . ',' . strtolower($additionalTags), true)
    );

    // Denied tags are removed again
    foreach (GeneralUtility::trimExplode(',', $this->procOptions['denyTags'] ?? '', true) as $deniedTag) {
        unset($keepTags[$deniedTag]);
    }

    if ($direction === 'rte') {
        // Converts the TypoScript-like configuration (having dots) to a plain multi-dimensional array
        [$keepTags] = $this->HTMLparserConfig($this->procOptions['HTMLparser_rte.'] ?? [], $keepTags);
    } elseif ($direction === 'db') {
        // Setting up span tags if they are allowed:
        if (isset($keepTags['span'])) {
            $classRule = ['removeIfFalse' => 1];
            if (!empty($this->allowedClasses)) {
                $classRule['list'] = $this->allowedClasses;
            }
            $keepTags['span'] = [
                'allowedAttribs' => 'id,class,style,title,lang,xml:lang,dir,itemscope,itemtype,itemprop',
                'fixAttrib' => [
                    'class' => $classRule,
                ],
                'rmTagIfNoAttrib' => 1,
            ];
        }
        // Further options come from the processing options, with defaults for nesting and attribute-less tags
        $parserConfiguration = $this->procOptions['HTMLparser_db.'] ?? [];
        if (empty($parserConfiguration['globalNesting'])) {
            $parserConfiguration['globalNesting'] = 'b,i,u,a,center,font,sub,sup,strong,em,strike,span';
        }
        if (empty($parserConfiguration['noAttrib'])) {
            $parserConfiguration['noAttrib'] = 'b,i,u,br,center,hr,sub,sup,strong,em,li,ul,ol,blockquote,strike';
        }
        [$keepTags] = $this->HTMLparserConfig($parserConfiguration, $keepTags);
    }

    // Caching (internally, in object memory) the result
    $this->getKeepTags_cache[$direction] = $keepTags;
    return $this->getKeepTags_cache[$direction];
}
/**
 * Resolves the $value into parts based on <p>-sections. These are returned as lines separated by LF.
 * The point is to resolve the HTML code returned from the RTE into ordinary lines so it is 'human-readable'.
 * The function ->setDivTags does the opposite.
 * This function processes content to go into the database.
 *
 * @param string $value Value to process.
 * @param int $count Recursion brake. Decremented on each recursion down to zero. Default is 5 (which equals the allowed nesting levels of p tags).
 * @param bool $returnArray If TRUE, an array with the lines is returned, otherwise a string of the processed input value.
 *                          Note that a string is returned in any case when no <p> section was found or the recursion brake hit.
 * @return string|array Processed input value.
 * @see setDivTags()
 */
protected function divideIntoLines(string $value, int $count = 5, bool $returnArray = false)
{
    // Setting the third param will eliminate false end-tags. Maybe this is a good thing to do...?
    $paragraphBlocks = $this->splitIntoBlock('p', $value, true);
    // Returns plainly the content if there was no p sections in it
    if (count($paragraphBlocks) <= 1 || $count <= 0) {
        return $this->sanitizeLineBreaksForContentOnly($value);
    }
    // Traverse the splitted sections
    foreach ($paragraphBlocks as $k => $v) {
        if ($k % 2) {
            // Inside a <p> section
            $v = $this->removeFirstAndLastTag($v);
            // Fetching 'sub-lines' - which will explode any further p nesting recursively
            $subLines = $this->divideIntoLines($v, $count - 1, true);
            if (is_array($subLines)) {
                // So, if there happened to be sub-nesting of p, this is written directly as the new content of THIS section. (This would be considered 'an error')
                $paragraphBlocks[$k] = implode(LF, $subLines);
            } else {
                //... but if NO subsection was found, we process it as a TRUE line without erroneous content:
                $paragraphBlocks[$k] = $this->processContentWithinParagraph($subLines, $paragraphBlocks[$k]);
            }
            // If it turns out the line is just blank (containing a &nbsp; possibly) then just make it pure blank.
            // The comparison has to be made against the entity: strip_tags() does not decode entities, and a
            // whitespace-only result could never survive the trim().
            // But, prevent filtering of lines that are blank in sense above, but whose tags contain attributes.
            // Those attributes should have been filtered before; if they are still there they must be considered as possible content.
            if (trim(strip_tags($paragraphBlocks[$k])) === '&nbsp;' && !preg_match('/\\<(img)(\\s[^>]*)?\\/?>/si', $paragraphBlocks[$k]) && !preg_match('/\\<([^>]*)?( align| class| style| id| title| dir| lang| xml:lang)([^>]*)?>/si', trim($paragraphBlocks[$k]))) {
                $paragraphBlocks[$k] = '';
            }
        } else {
            // Outside a paragraph: only the tags allowed outside of paragraphs are kept
            $paragraphBlocks[$k] = trim(strip_tags($paragraphBlocks[$k], '<' . implode('><', $this->allowedTagsOutsideOfParagraphs) . '>'));
            $paragraphBlocks[$k] = $this->sanitizeLineBreaksForContentOnly($paragraphBlocks[$k]);
            if ((string)$paragraphBlocks[$k] === '') {
                // Remove positions which are outside <p> tags and without content
                unset($paragraphBlocks[$k]);
            } else {
                // If there is still something in there, add <p> tags around the text content.
                // (An empty text content makes str_replace() return the block unchanged.)
                $textContent = strip_tags($paragraphBlocks[$k]);
                $paragraphBlocks[$k] = str_replace($textContent, '<p>' . $textContent . '</p>', $paragraphBlocks[$k]);
            }
        }
    }
    return $returnArray ? $paragraphBlocks : implode(LF, $paragraphBlocks);
}
/**
 * Converts all lines into <p></p>-sections (unless the line has a p - tag already)
 * For processing of content going FROM database TO RTE.
 *
 * @param string $value Value to convert
 * @return string Processed value.
 * @see divideIntoLines()
 */
protected function setDivTags(string $value): string
{
    // First, setting configuration for the HTMLcleaner function. This will process each line between the <div>/<p> section on their way to the RTE
    $keepTags = $this->getKeepTags('rte');
    // Divide the content into lines
    $parts = explode(LF, $value);
    foreach ($parts as $k => $line) {
        // Processing of line content:
        if (trim($line) === '') {
            // If the line is blank, set it to &nbsp; so that it becomes a spacing paragraph
            $line = '&nbsp;';
        } else {
            // Clean the line content, keeping unknown tags (as they can be removed in the entryHTMLparser)
            $line = $this->HTMLcleaner($line, $keepTags, 'protect');
            // convert double-encoded &nbsp; into regular &nbsp; however this could also be reversed via the exitHTMLparser
            // This was previously an option to disable called "dontConvAmpInNBSP_rte"
            $line = str_replace('&amp;nbsp;', '&nbsp;', $line);
        }
        $parts[$k] = $line;
        $firstTagName = strtolower($this->getFirstTagName($line));
        // No wrapping if the line starts with a tag that is allowed outside of paragraphs ...
        if (in_array($firstTagName, $this->allowedTagsOutsideOfParagraphs, true)) {
            continue;
        }
        // ... or if the line contains an hr tag. The line itself has to be inspected here:
        // a tag name never contains "<", so testing the tag name could not match.
        if (preg_match('/<(hr)(\\s[^>\\/]*)?[[:space:]]*\\/?>/i', $line)) {
            continue;
        }
        // Only set p-tags if the line is not already enclosed in div or p tags:
        $testStr = strtolower(trim($line));
        $isWrappedInDiv = str_starts_with($testStr, '<div') && str_ends_with($testStr, '</div>');
        $isWrappedInParagraph = str_starts_with($testStr, '<p') && str_ends_with($testStr, '</p>');
        if (!$isWrappedInDiv && !$isWrappedInParagraph) {
            $parts[$k] = '<p>' . $line . '</p>';
        }
    }
    // Implode result:
    return implode(LF, $parts);
}
/**
 * Used for transformation from RTE to DB
 *
 * Works on a single line within a <p> tag when storing into the database.
 * The content is cleaned up via HTMLcleaner_db(), and the <p> tag is rebuilt with only those
 * attributes (and CSS classes) that are allowed.
 *
 * @param string $content the content within the <p> tag
 * @param string $fullContentWithTag the whole <p> tag surrounded as well
 *
 * @return string the full <p> tag with cleaned content
 */
protected function processContentWithinParagraph(string $content, string $fullContentWithTag): string
{
    $cleanedContent = $this->HTMLcleaner_db($content);
    $paragraphTag = $this->getFirstTag($fullContentWithTag);

    $attributes = [];
    if (!empty($this->allowedAttributesForParagraphTags)) {
        [$attributes] = $this->get_tag_attributes($paragraphTag);
        // Drop every attribute that is not defined to be allowed
        $attributes = array_intersect_key($attributes, array_flip($this->allowedAttributesForParagraphTags));

        // Class filtering applies only if allowed classes are configured and the
        // attribute value as a whole is not one of them
        $classValue = $attributes['class'] ?? null;
        $filterClasses = $classValue !== null
            && trim($classValue) !== ''
            && !empty($this->allowedClasses)
            && !in_array($classValue, $this->allowedClasses, true);
        if ($filterClasses) {
            $keptClasses = array_intersect(
                GeneralUtility::trimExplode(' ', $classValue, true),
                $this->allowedClasses
            );
            if (empty($keptClasses)) {
                unset($attributes['class']);
            } else {
                $attributes['class'] = implode(' ', $keptClasses);
            }
        }
    }

    // Remove any line break
    $cleanedContent = str_replace(LF, '', $cleanedContent);
    // Compile the surrounding <p> tag; rtrim() removes the blank if there are no attributes
    $openingTag = '<' . rtrim('p ' . $this->compileTagAttribs($attributes)) . '>';
    return $openingTag . $cleanedContent . '</p>';
}
/**
 * Used when transforming from RTE to DB: puts every <hr> tag on a line of its own,
 * replaces double LFs by a single LF (one pass) and removes one LF at the very
 * start and at the very end of the content.
 *
 * @param string $content the content to process
 * @return string the modified content
 */
protected function sanitizeLineBreaksForContentOnly(string $content): string
{
    $hrPattern = '/<(hr)(\\s[^>\\/]*)?[[:space:]]*\\/?>/i';
    $withIsolatedHr = preg_replace($hrPattern, LF . '<$1$2/>' . LF, $content) ?? $content;
    $withoutDoubleBreaks = str_replace(LF . LF, LF, $withIsolatedHr);
    return preg_replace('/(^' . LF . ')|(' . LF . '$)/i', '', $withoutDoubleBreaks) ?? $withoutDoubleBreaks;
}
/**
 * Called before any processing / transformation is made.
 * Removes every CR (char 13), so that only LFs (char 10) are dealt with internally.
 *
 * Historical note: Previously it was possible to disable this functionality via disableUnifyLineBreaks.
 *
 * @param string $content the content to process
 * @return string the content without any CR
 */
protected function streamlineLineBreaksForProcessing(string $content): string
{
    $withoutCarriageReturns = str_replace(CR, '', $content);
    return $withoutCarriageReturns;
}
/**
 * Called after any processing / transformation was made: just before the content is
 * returned by the RTE parser, all line breaks get unified to be "CRLF"s again.
 *
 * Historical note: Previously it was possible to disable this functionality via disableUnifyLineBreaks.
 *
 * @param string $content the content to process
 * @return string the content with CRLF line breaks
 */
protected function streamlineLineBreaksAfterProcessing(string $content): string
{
    // Strip any CR that has entered in the meantime, so that no "\r\r\n" can be produced ...
    $lfOnly = $this->streamlineLineBreaksForProcessing($content);
    // ... and then change all \n into \r\n
    return str_replace(LF, CRLF, $lfOnly);
}
/**
 * Content Transformation from DB to RTE
 *
 * Resolves the href of every <a> tag via the LinkService and dispatches a BrokenLinkAnalysisEvent
 * for it. If the event reports the link as broken, or the link handler is unknown, the attribute
 * "data-rte-error" is added to the tag. Nested <a> blocks are processed recursively.
 *
 * @param string $content the content to process
 * @return string the modified content
 */
protected function markBrokenLinks(string $content): string
{
    $linkService = GeneralUtility::makeInstance(LinkService::class);
    $segments = $this->splitIntoBlock('A', $content);

    foreach ($segments as $index => $segment) {
        // Only odd positions hold <a> blocks
        if ($index % 2 === 0) {
            continue;
        }
        [$tagAttributes] = $this->get_tag_attributes($this->getFirstTag($segment), true);
        if (empty($tagAttributes['href'])) {
            continue;
        }

        try {
            $linkDetails = $linkService->resolve($tagAttributes['href']);
            $analysis = new BrokenLinkAnalysisEvent($linkDetails['type'], $linkDetails);
            $this->eventDispatcher->dispatch($analysis);
            if ($analysis->isBrokenLink()) {
                $tagAttributes['data-rte-error'] = $analysis->getReason();
            }
        } catch (InsufficientFolderAccessPermissionsException $e) {
            // do nothing if user doesn't have access to the file/folder
        } catch (UnknownLinkHandlerException $e) {
            $tagAttributes['data-rte-error'] = $e->getMessage();
        }

        // Always rewrite the block to allow the nested calling even if a page is found
        $innerContent = $this->removeFirstAndLastTag($segments[$index]);
        $segments[$index] = '<a ' . GeneralUtility::implodeAttributes($tagAttributes, true, true) . '>'
            . $this->markBrokenLinks($innerContent)
            . '</a>';
    }

    return implode('', $segments);
}
/**
 * Content Transformation from RTE to DB
 *
 * Removes the "data-rte-error" attribute and the inline error styling from every <a> tag
 * that has an href. Nested <a> blocks are processed recursively.
 *
 * @param string $content the content to process
 * @return string the modified content
 */
protected function removeBrokenLinkMarkers(string $content): string
{
    $segments = $this->splitIntoBlock('A', $content);

    foreach ($segments as $index => $segment) {
        // Only odd positions hold <a> blocks
        if ($index % 2 === 0) {
            continue;
        }
        [$tagAttributes] = $this->get_tag_attributes($this->getFirstTag($segment), true);
        if (empty($tagAttributes['href'])) {
            continue;
        }

        // Always remove the markers again (regardless of the page was found or not)
        // so the database does not contain ugly stuff
        unset($tagAttributes['data-rte-error']);
        if (isset($tagAttributes['style'])) {
            $remainingStyle = trim(str_replace('background-color: yellow; border:2px red solid; color: black;', '', $tagAttributes['style']));
            $tagAttributes['style'] = $remainingStyle;
            if (empty($remainingStyle)) {
                unset($tagAttributes['style']);
            }
        }

        $innerContent = $this->removeFirstAndLastTag($segments[$index]);
        $segments[$index] = '<a ' . GeneralUtility::implodeAttributes($tagAttributes, true, true) . '>'
            . $this->removeBrokenLinkMarkers($innerContent)
            . '</a>';
    }

    return implode('', $segments);
}
/**
 * Passes the content through the HTML sanitizer, unless sanitizing is switched off.
 *
 * Sanitizing is skipped if the configuration contains the key "htmlSanitize" with an empty value,
 * or if the feature flag "security.backend.htmlSanitizeRte" is disabled.
 * The builder is taken from "htmlSanitize." > "build" (default: "default"): either a class
 * implementing BuilderInterface, or a name handed to the SanitizerBuilderFactory.
 *
 * @param string $content the content to sanitize
 * @param array $configuration the "HTMLparser_db." processing options
 * @return string the sanitized (or untouched) content
 */
protected function htmlSanitize(string $content, array $configuration): string
{
    $features = GeneralUtility::makeInstance(Features::class);
    // either `htmlSanitize = null` or `htmlSanitize = false`
    $disabledByConfiguration = array_key_exists('htmlSanitize', $configuration)
        && empty($configuration['htmlSanitize']);
    if ($disabledByConfiguration || !$features->isFeatureEnabled('security.backend.htmlSanitizeRte')) {
        return $content;
    }

    $build = $configuration['htmlSanitize.']['build'] ?? 'default';
    $isBuilderClass = class_exists($build) && is_a($build, BuilderInterface::class, true);
    $builder = $isBuilderClass
        ? GeneralUtility::makeInstance($build)
        : GeneralUtility::makeInstance(SanitizerBuilderFactory::class)->build($build);

    $initiator = GeneralUtility::makeInstance(SanitizerInitiator::class, static::class);
    return $builder->build()->sanitize($content, $initiator);
}
}