diff options
Diffstat (limited to 'lib/classes/Markup.php')
| -rw-r--r-- | lib/classes/Markup.php | 788 |
1 files changed, 788 insertions, 0 deletions
<?php
/**
 * Markup.php - Handling of Stud.IP- and HTML-markup.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of
 * the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * @category    Stud.IP
 * @copyright   (c) 2014 Stud.IP e.V.
 * @license     http://www.gnu.org/licenses/gpl-2.0.html GPL version 2
 * @since       File available since Release 3.0
 * @author      Robert Costa <rcosta@uos.de>
 */
namespace Studip;

require_once __DIR__ . '/htmlpurifier/HTMLPurifier_Injector_ClassifyLinks.php';
require_once __DIR__ . '/htmlpurifier/HTMLPurifier_Injector_ClassifyTables.php';
require_once __DIR__ . '/htmlpurifier/HTMLPurifier_Injector_LinkifyEmail.php';
require_once __DIR__ . '/htmlpurifier/HTMLPurifier_Injector_TransformLinks.php';
require_once __DIR__ . '/htmlpurifier/HTMLPurifier_Injector_Unlinkify.php';

/**
 * Static helpers for converting between Stud.IP markup, plain text and
 * purified HTML.
 */
class Markup
{
    /**
     * Apply markup rules and clean the text up.
     *
     * The text is first converted with markupToHtml() (without adding the
     * HTML marker) and the result is then passed to $markup->format().
     *
     * @param TextFormat $markup  Markup rules applied on marked-up text.
     * @param string     $text    Marked-up text on which rules are applied.
     * @param boolean    $trim    Trim text before applying markup rules, if TRUE.
     *
     * @return string  HTML code computed from marked-up text.
     */
    public static function apply($markup, $text, $trim)
    {
        return $markup->format(self::markupToHtml($text, $trim, false));
    }

    // signature for HTML entries
    const HTML_MARKER = '<!--HTML-->';

    // signature for HTML fallback entries
    const HTML_MARKER_FALLBACK = '<!-- HTML: Insert text after this line only. -->';

    // regular expression for detecting HTML signature
    const HTML_MARKER_REGEXP = '/^\s*<!--\s*HTML.*?-->/i';

    /**
     * Return `true` if the WYSIWYG editor is enabled for this user.
     * @deprecated since Stud.IP 5.5
     *
     * @return boolean  always returns `true`.
     */
    public static function editorEnabled()
    {
        return true;
    }

    /**
     * Return `true` for HTML code and `false` for plain text.
     *
     * Text counts as HTML code only if it starts with a match for
     * `HTML_MARKER_REGEXP` (leading whitespace is ignored). Everything
     * else is considered to be plain text. This is an alias of
     * hasHtmlMarker().
     *
     * @param string $text  HTML code or plain text.
     *
     * @return boolean  `true` for HTML code, `false` for plain text.
     */
    public static function isHtml($text)
    {
        return self::hasHtmlMarker($text);
    }

    /**
     * Return `true` for Stud.IP-HTML and `false` otherwise.
     *
     * Stud.IP-HTML is HTML that can contain Stud.IP Markup.
     *
     * Stud.IP-HTML must match Stud.IP 3.2's HTML marker.
     * Leading and trailing whitespace is ignored.
     *
     * Everything else is considered not Stud.IP-HTML. In other
     * words, if it's not Stud.IP-HTML it might be everything
     * from plain text to binary code. But usually it's either
     * Stud.IP markup or plain HTML code, then.
     *
     * @param string $text  Text that is or isn't Stud.IP-HTML.
     *
     * @return boolean  `true` for Stud.IP-HTML
     */
    public static function isHtmlFallback($text)
    {
        $text = trim($text);

        // it's not fallback if the new HTML marker is detected
        if (MarkupPrivate\Text\startsWith($text, self::HTML_MARKER)) {
            return false;
        }

        // it's Stud.IP-HTML if Stud.IP 3.2's HTML marker is detected
        if (MarkupPrivate\Text\startsWith($text, self::HTML_MARKER_FALLBACK)) {
            return true;
        }

        return false;
    }

    /**
     * Return `true` for HTML code and `false` for plain text.
     *
     * HTML code must start with a match for `HTML_MARKER_REGEXP`.
     *
     * @param string $text  HTML code or plain text.
     *
     * @return boolean  `true` for HTML code, `false` for plain text.
     */
    public static function hasHtmlMarker($text)
    {
        // preg_match() yields 1, 0 or false; compare explicitly so that the
        // documented boolean is returned instead of the raw result.
        return preg_match(self::HTML_MARKER_REGEXP, $text) === 1;
    }

    /**
     * Mark a given text as HTML code.
     *
     * No sanity-checking is done on the given text. It is simply
     * marked up so to be identified by Markup::isHtml as HTML
     * code. Text that already carries a marker, or that is empty after
     * trimming, is returned unchanged.
     *
     * @param string $text  The text to be marked up as HTML code.
     *
     * @return string  The text marked up as HTML code.
     */
    public static function markAsHtml($text)
    {
        // NOTE keep this function in sync with the JavaScript
        // function markAsHtml in WyswygHtmlHead.php
        if (self::hasHtmlMarker($text) || trim($text) === '') {
            return $text; // marker already set, don't set twice
        }
        return self::HTML_MARKER . $text;
    }

    /**
     * Apply markup rules after running text through HTML ready.
     *
     * Line breaks are normalized to "\n" first, the text is escaped with
     * htmlReady(), formatted via markupText(), and finally every remaining
     * "\n" is replaced by `<br>`.
     *
     * @param TextFormat $markup  Markup rules applied on marked-up text.
     * @param string     $text    Marked-up text on which rules are applied.
     * @param boolean    $trim    Trim text before applying markup rules, if TRUE.
     *
     * @return string  HTML code computed from marked-up text.
     */
    private static function markupHtmlReady($markup, $text, $trim)
    {
        return str_replace("\n", '<br>', self::markupText(
            $markup, self::htmlReady(self::unixEOL($text), $trim)));
    }

    /**
     * Convert line break to Unix format.
     *
     * @param string $text  Text with possibly mixed line breaks (Win, Mac, Unix).
     *
     * @return string  Text with Unix line breaks only.
     */
    private static function unixEOL($text)
    {
        return preg_replace("/\r\n?/", "\n", $text);
    }

    /**
     * Apply markup rules on plain text.
     *
     * The formatted text is additionally passed through the global
     * symbol() helper.
     *
     * @param TextFormat $markup  Markup rules applied on marked-up text.
     * @param string     $text    Marked-up text on which rules are applied.
     *
     * @return string  HTML code computed from marked-up text.
     */
    private static function markupText($markup, $text)
    {
        return symbol($markup->format($text));
    }

    /**
     * Call HTMLPurifier to create safe HTML.
     *
     * @param   string  $dirty_html  Unsafe or 'uncleaned' HTML code.
     * @param   boolean $autoformat  Apply the AutoFormat rules
     * @return  string               Clean and safe HTML code.
     */
    private static function purify($dirty_html, $autoformat = true)
    {
        $purifier = self::createPurifier($autoformat);

        return $purifier->purify($dirty_html);
    }

    /**
     * Call HTMLPurifier to filter the HTML code (if the source is detected
     * to contain HTML, returns the argument unchanged otherwise). The HTML
     * marker is restored afterwards, if it was present.
     *
     * For an \I18NString the original text and every translation are
     * purified separately and a new \I18NString is returned.
     *
     * @param   string|\I18NString $html  Unsafe or 'uncleaned' HTML code.
     * @return  string|\I18NString        Clean and safe HTML code.
     */
    public static function purifyHtml($html)
    {
        if ($html instanceof \I18NString) {
            $base = self::purifyHtml($html->original());
            $lang = $html->toArray();

            foreach ($lang as &$value) {
                $value = self::purifyHtml($value);
            }
            // break the reference so $value cannot alias the last element
            unset($value);

            return new \I18NString($base, $lang);
        }

        if (self::isHtml($html)) {
            $html = self::markAsHtml(self::purify($html));
        }

        return $html;
    }

    /**
     * Create HTML purifier instance with Stud.IP-specific configuration.
     *
     * @param  boolean $autoformat  Apply the AutoFormat rules
     * @return \HTMLPurifier        A new instance of the HTML purifier.
     */
    private static function createPurifier($autoformat)
    {
        $config = \HTMLPurifier_Config::createDefault();
        $config->set('Cache.SerializerPath', $GLOBALS['TMP_PATH']);
        $config->set('Core.RemoveInvalidImg', true);

        // restrict allowed HTML tags and attributes
        //
        // note that changes here should also be reflected in CKEditor's
        // settings!!
        //
        // NOTE The list could be restricted even further by allowing only
        // specific values for some attributes and CSS styles, but that is not
        // directly supported by HTMLPurifier and would need to be implemented
        // with a filter similar to ClassifyLinks.
        //
        // This is a list of further restrictions that can/should be introduced
        // at a later time point maybe, if possible:
        //
        // - always open external links in a new tab or window
        //   a[class="link-extern" href="..." target="_blank"]
        // - only allow left margin and horizontal text alignment to be set in
        //   divs (NOTE maybe remove these two features completely?):
        //   div[style="margin-left:(40|80|...)px; text-align:(center|right|justify)"]
        // - img[style] should only allow float:left or float:right
        // - only allow text color and background color to be set in a span's
        //   style attribute (NOTE 'wiki-links' are currently set here due to
        //   implementation difficulties, but probably this should be
        //   changed...):
        //   span[style="color:(#000000|#800000|...);
        //               background-color:(#000000|#800000|...)"
        //        class="wiki-link"]
        // - tables should always have the class "content" (it should not be
        //   optional and no other class should be set):
        //   table[class="content"]
        // - table headings should have a column and/or a row scope or no scope
        //   at all, but nothing else:
        //   th[scope="(col | row)"]
        // - fonts: only Stud.IP-specific fonts should be allowed
        //
        $config->set('HTML.Allowed', '
            a[class|href|target|rel|name|id]
            audio[controls|src|height|width|style]
            big
            blockquote
            br
            caption
            code[class]
            div[class|style]
            em
            figure[class|style]
            figcaption
            h1
            h2
            h3
            h4
            h5
            h6
            hr
            i
            img[alt|src|height|width|class|style]
            li
            ol[reversed|start|style]
            p[style]
            pre[class]
            span[style|class]
            strong
            u
            ul[style]
            s
            small
            sub
            sup
            table[class|style]
            tbody
            td[colspan|rowspan|style]
            thead
            th[colspan|rowspan|style|scope]
            tr
            tt
            video[controls|src|height|width|style]
        ');

        $config->set('Attr.AllowedFrameTargets', ['_blank']);
        $config->set('Attr.AllowedRel', ['nofollow']);
        $config->set('Attr.EnableID', true);
        $config->set('Attr.AllowedClasses', [
            'author',
            'content',
            'image',
            'image-style-side',
            'image_resized',
            'language-cpp',
            'language-css',
            'language-diff',
            'language-java',
            'language-javascript',
            'language-json',
            'language-php',
            'language-python',
            'language-ruby',
            'language-scss',
            'language-sql',
            'language-xml',
            'link-extern',
            'link-intern',
            'math-tex',
            'table',
            'usercode',
            'wiki-link'
        ]);
        $config->set('CSS.AllowedFonts', [
            'serif',
            'sans-serif',
            'monospace',
            'cursive'
        ]);
        $config->set('CSS.AllowedProperties', [
            'margin-left',
            'text-align',
            'width',
            'height',
            'color',
            'background-color', // needed by span, td
            'border-color',
            'border-style',
            'float',
            'border',
            'vertical-align'
        ]);
        $config->set('CSS.MaxImgLength', null);

        if ($autoformat) {
            $config->set('AutoFormat.Linkify', true);
            $config->set('AutoFormat.Custom', [
                'ClassifyLinks',
                'ClassifyTables',
                'LinkifyEmail'
            ]);
            $config->set('AutoFormat.RemoveSpansWithoutAttributes', true);
        } else {
            $config->set('AutoFormat.Custom', ['TransformLinks']);
        }

        // avoid <img src="evil_CSRF_stuff">
        $def = $config->getHTMLDefinition(true);
        $img = $def->addBlankElement('img');
        $img->attr_transform_post[]
            = new MarkupPrivate\Purifier\AttrTransform_Image_Source();

        $def->addElement('audio', 'Inline', 'Flow', 'Common', [
            'src*'     => 'URI',
            'width'    => 'Length',
            'height'   => 'Length',
            'controls' => 'Text', // Bool triggers bug in HTMLPurifier
        ]);

        $def->addElement('video', 'Inline', 'Flow', 'Common', [
            'src*'     => 'URI',
            'width'    => 'Length',
            'height'   => 'Length',
            'controls' => 'Text', // Bool triggers bug in HTMLPurifier
        ]);

        $def->addElement('figcaption', 'Inline', 'Flow', 'Common');
        $def->addElement('figure', 'Block', 'Optional: (figcaption, Flow) | (Flow, figcaption) | Flow', 'Common');

        $def->addAttribute('ol', 'reversed', 'Bool');
        $def->addAttribute('ol', 'style', 'Text');
        $def->addAttribute('ul', 'style', 'Text');

        return new \HTMLPurifier($config);
    }

    /**
     * Convert special characters to HTML entities, and clean up.
     *
     * @param  string  $text  This text's special chars will be converted.
     * @param  boolean $trim  Trim text before applying markup rules, if TRUE.
     * @param  boolean $br    Replace newlines by <br>, if TRUE.
     * @param  boolean $double_encode  Encode existing HTML entities, if TRUE.
     * @return string         The converted string.
     */
    public static function htmlReady(
        $text, $trim = true, $br = false, $double_encode = true
    ) {
        $text = htmlspecialchars($text, ENT_QUOTES, 'utf-8', $double_encode);

        if ($trim) {
            $text = trim($text);
        }
        if ($br) { // fix newlines
            $text = nl2br($text, false);
        }
        return $text;
    }

    /**
     * Prepare text for wysiwyg (if enabled), otherwise convert special
     * characters using htmlReady.
     *
     * When the editor is enabled the text is first converted with
     * markupToHtml(); in either case the result is then escaped with
     * htmlReady().
     *
     * @param  string  $text  The text.
     * @param  boolean $trim  Trim text before applying markup rules, if TRUE.
     * @param  boolean $br    Replace newlines by <br>, if TRUE.
     * @param  boolean $double_encode  Encode existing HTML entities, if TRUE.
     * @return string         The converted string.
     */
    public static function wysiwygReady(
        $text, $trim = true, $br = false, $double_encode = true
    ) {
        if (self::editorEnabled()) {
            $text = self::markupToHtml($text, $trim);
        }
        return self::htmlReady($text, $trim, $br, $double_encode);
    }

    /**
     * Convert Stud.IP markup (possibly mixed with HTML if fallback mode is
     * enabled) to editable HTML. Pure HTML will only run through the purifier.
     *
     * @param  string  $text  The text.
     * @param  boolean $trim  Trim text before applying markup rules, if TRUE.
     * @param  boolean $mark  Mark result text as HTML, if TRUE.
     * @return string         The converted string.
     */
    public static function markupToHtml($text, $trim = true, $mark = true)
    {
        // NOTE(review): this is a truthiness test, so besides empty and
        // whitespace-only text the string "0" is returned unchanged as well.
        if (!trim($text)) {
            return $text;
        }
        if (self::isHtml($text)) {
            // detect the fallback marker before purifying the text
            $is_fallback = self::isHtmlFallback($text);
            $text = self::purify($text, false);

            if ($is_fallback) {
                $text = self::markupText(new \StudipCoreFormat(), $text);
            }
        } else {
            $text = self::markupHtmlReady(new \StudipCoreFormat(), $text, $trim);
        }

        return $mark ? self::markAsHtml($text) : $text;
    }

    /**
     * Call HTMLPurifier to remove all HTML tags from the string (if the source
     * is detected to contain HTML, returns the argument unchanged otherwise).
     *
     * @param  string $html  HTML code to filter
     * @return string        The converted string.
     */
    public static function removeHtml($html)
    {
        if (self::isHtml($html)) {
            $config = \HTMLPurifier_Config::createDefault();
            $config->set('Cache.SerializerPath', $GLOBALS['TMP_PATH']);
            $config->set('HTML.Allowed', 'a[href],img[alt|src],br');
            $config->set('AutoFormat.Custom', ['Unlinkify']);

            // insert explicit line breaks after block-level closing tags so
            // that the structure survives the removal of the tags themselves
            $html = str_replace('</li>', '</li><br>', $html);
            $html = str_replace('</ol>', '</ol><br>', $html);
            $html = str_replace('</ul>', '</ul><br>', $html);
            $html = str_replace('</tr>', '</tr><br>', $html);
            $html = str_replace('</p>', '</p><br><br>', $html);
            $html = str_replace('</div>', '</div><br><br>', $html);

            $purifier = new \HTMLPurifier($config);
            $html = $purifier->purify($html);

            // Replace new lines with simple line break; twice because we don't
            // want to create unnecessary white space if a <br /> is followed
            // by a new line
            $html = str_replace('<br />' . PHP_EOL, PHP_EOL, $html);
            $html = str_replace('<br />', PHP_EOL, $html);

            $html = \decodeHTML(trim($html));
        }

        return $html;
    }
}

/**
 * Members of Studip\MarkupPrivate must not be used outside of this file!!
 */

namespace Studip\MarkupPrivate\Purifier;

use Studip\MarkupPrivate\MediaProxy;

/**
 * Remove invalid <img src> attributes.
 */
class AttrTransform_Image_Source extends \HTMLPurifier_AttrTransform
{
    /**
     * Implements abstract method of base class.
     *
     * Replaces the `src` attribute by the URL returned from
     * MediaProxy\getMediaUrl(). If that URL is rejected, the attribute is
     * set to NULL and a message is appended to $GLOBALS['msg'].
     *
     * @param  array $attr     Attributes of the element.
     * @param  mixed $config   Unused here.
     * @param  mixed $context  Unused here.
     * @return array           The (possibly modified) attributes.
     */
    public function transform($attr, $config, $context)
    {
        // nothing to transform without a source URL
        if (!isset($attr['src'])) {
            return $attr;
        }

        try {
            $attr['src'] = MediaProxy\getMediaUrl($attr['src']);
        } catch (MediaProxy\InvalidInternalLinkException $e) {
            // invalid internal link ==> remove <img src> attribute
            $GLOBALS['msg'][] = _('Ungültige interne Medienverknüpfung entfernt: ')
                . \htmlentities($e->getUrl());
            $attr['src'] = null; // remove <img src> attribute
        } catch (MediaProxy\ExternalMediaDeniedException $e) {
            $GLOBALS['msg'][] = _('Verbotene externe Medienverknüpfung entfernt: ')
                . \htmlentities($e->getUrl());
            $attr['src'] = null; // remove <img src> attribute
        }
        return $attr;
    }
}

//// media proxy //////////////////////////////////////////////////////////////

namespace Studip\MarkupPrivate\MediaProxy;

use Studip\MarkupPrivate\Text;

/**
 * Check if media proxy should be used and if so return the respective URL.
 *
 * @param string $url   URL to media file.
 * @return string       URL to media file (possibly 'proxied').
 * @throws InvalidInternalLinkException  If the URL is internal but does not
 *                                       point to a valid media location.
 * @throws ExternalMediaDeniedException  If the URL is external and the
 *                                       LOAD_EXTERNAL_MEDIA setting permits
 *                                       neither proxying nor direct access.
 */
function getMediaUrl($url) {
    // even though proxied URLs shouldn't be stored in the database, the
    // next line will handle those cases where they're accidentally there
    $url = decodeMediaProxyUrl($url);

    // handle internal media links
    if (isStudipMediaUrl($url)) {
        return transformInternalIdnaLink($url);
    }
    if (isInternalLink($url)) {
        // link is studip-internal, but not to a valid media location
        throw new InvalidInternalLinkException($url);
    }

    // handle external media links
    $external_media = \Config::get()->LOAD_EXTERNAL_MEDIA;
    if ($external_media === 'proxy' &&
        \Seminar_Session::is_current_session_authenticated()
    ) {
        // media proxy must be accessed by an internal link
        return encodeMediaProxyUrl($url);
    }
    if ($external_media === 'allow') {
        return $url;
    }
    throw new ExternalMediaDeniedException($url);
}

/**
 * Return media proxy URL for an unproxied URL.
 *
 * @param string $url   Unproxied media URL.
 * @return string       Media proxy URL for accessing the same resource.
 */
function encodeMediaProxyUrl($url) {
    return transformInternalIdnaLink(
        getMediaProxyUrl() . '?url=' . \urlencode(\idna_link($url)));
}

/**
 * Extract the original URL from a media proxy URL.
 *
 * @param string $url   The media proxy URL.
 * @return string       The original URL. If $url does not point to the media
 *                      proxy then this is the exact same value given by $url.
 */
function decodeMediaProxyUrl($url) {
    # TODO make it work for 'url=' at any position in query
    $urlpath = removeStudipDomain($url);
    $proxypath = removeStudipDomain(getMediaProxyUrl()) . '?url=';
    if (Text\startsWith($urlpath, $proxypath)) {
        return \urldecode(Text\removePrefix($urlpath, $proxypath));
    }
    return $url;
}

/**
 * Return Stud.IP's absolute media proxy URL.
 *
 * @return string  $GLOBALS['ABSOLUTE_URI_STUDIP'] followed by the media
 *                 proxy's dispatcher path.
 */
function getMediaProxyUrl() {
    return $GLOBALS['ABSOLUTE_URI_STUDIP'] . 'dispatch.php/media_proxy';
}

/**
 * Test if an URL points to a valid internal Stud.IP media path.
 *
 * @param string $url   Internal Stud.IP URL.
 * @return boolean      TRUE for internal media link URLs, FALSE otherwise.
 */
function isStudipMediaUrl($url) {
    return isInternalLink($url) &&
        isStudipMediaUrlPath(getStudipRelativePath($url));
}

/**
 * Test if an URL is internal, as decided by the global is_internal_url()
 * helper applied to the normalized URL.
 *
 * @param string $url   URL to test.
 * @return boolean      Result of is_internal_url().
 */
function isInternalLink($url) {
    return is_internal_url(transformInternalIdnaLink($url));
}

//// url utilities ////////////////////////////////////////////////////////////

/**
 * Remove domain name from internal URLs.
 *
 * Remove scheme, domain and authentication information from internal
 * Stud.IP URLs. Leave external URLs untouched.
 *
 * @param string $url   URL from which to remove internal domain.
 * @return string       URL without internal domain or the exact same
 *                      value as $url for external URLs.
 */
function removeStudipDomain($url) {
    if (!isInternalLink($url)) {
        return $url;
    }
    $parsed_url = \parse_url(transformInternalIdnaLink($url));
    $path = isset($parsed_url['path']) ? $parsed_url['path'] : '';
    $query = isset($parsed_url['query']) ? '?' . $parsed_url['query'] : '';
    $fragment = isset($parsed_url['fragment']) ? '#' . $parsed_url['fragment'] : '';
    return $path . $query . $fragment;
}

/**
 * Return a URL's path component with the absolute Stud.IP path removed.
 *
 * NOTE: If the URL is not an internal Stud.IP URL, the path component will
 * nevertheless be returned without issuing an error message. A URL without
 * a path component yields an empty string.
 *
 * Example:
 * >>> getStudipRelativePath('http://localhost:8080'
 *     . '/studip/sendfile.php?type=0&file_id=ABC123&file_name=nice.jpg')
 * 'sendfile.php'
 *
 * @param string $url   The URL from which to return the Stud.IP-relative
 *                      path component.
 * @return string       Stud.IP-relative path component of $url.
 */
function getStudipRelativePath($url) {
    $parsed_url = \parse_url(transformInternalIdnaLink($url));
    $parsed_studip_url = getParsedStudipUrl();

    // parse_url() omits missing components (and returns false for seriously
    // malformed URLs), so never index the result unconditionally
    $path = isset($parsed_url['path']) ? $parsed_url['path'] : '';
    $studip_path = isset($parsed_studip_url['path']) ? $parsed_studip_url['path'] : '';

    return Text\removePrefix($path, $studip_path);
}

/**
 * Return an associative array containing the Stud.IP URL elements.
 *
 * see also: http://php.net/manual/en/function.parse-url.php
 *
 * @return mixed  Same values that PHP's parse_url() returns.
 */
function getParsedStudipUrl() {
    return \parse_url($GLOBALS['ABSOLUTE_URI_STUDIP']);
}

/**
 * Test if path is valid for internal Stud.IP media URLs.
 *
 * A path is valid if its first segment is one of the known media locations
 * and its URL-decoded form does not contain '../'.
 *
 * @param string $path  The path component of an URL.
 * @return boolean      TRUE for valid media paths, FALSE otherwise.
 */
function isStudipMediaUrlPath($path) {
    $segments = \explode('/', $path);
    $path_head = $segments[0];
    $valid_paths = ['sendfile.php', 'download', 'assets', 'pictures'];
    return \mb_strpos(\urldecode($path), '../') === false
        && \in_array($path_head, $valid_paths, true);
}

/**
 * Return a normalized, internal URL.
 *
 * The URL is passed through the global TransformInternalLinks() and
 * idna_link() helpers, in that order.
 *
 * @param string $url   An internal URL.
 * @return string       Normalized internal URL.
 */
function transformInternalIdnaLink($url) {
    return \idna_link(\TransformInternalLinks($url));
}

//// url exceptions ///////////////////////////////////////////////////////////

/**
 * Base class for exceptions that carry the offending URL.
 */
class UrlException extends \Exception
{
    private $url;

    /**
     * @param string $url  The URL that caused the exception.
     */
    public function __construct($url) {
        parent::__construct();
        $this->url = $url;
    }

    /**
     * @return string  The URL that caused the exception.
     */
    public function getUrl()
    {
        return $this->url;
    }
}

/**
 * Thrown for internal URLs that do not point to a valid media location.
 */
class InvalidInternalLinkException extends UrlException
{
}

/**
 * Thrown for external URLs when external media may not be loaded.
 */
class ExternalMediaDeniedException extends UrlException
{
}

//// string utilities /////////////////////////////////////////////////////////

namespace Studip\MarkupPrivate\Text;

/**
 * Test if string starts with prefix.
 *
 * @param string $string  Tested string.
 * @param string $prefix  Prefix of tested string.
 *
 * @return boolean  TRUE if string starts with prefix.
 */
function startsWith($string, $prefix) {
    return \mb_substr($string, 0, \mb_strlen($prefix)) === $prefix;
}

/**
 * Test if string ends with suffix.
 *
 * @param string $string  Tested string.
 * @param string $suffix  Suffix of tested string.
 *
 * @return boolean  TRUE if string ends with suffix.
 */
function endsWith($string, $suffix) {
    // An empty suffix needs special care: a start offset of -0 equals 0, so
    // mb_substr() would return the whole string instead of an empty one.
    if ($suffix === '') {
        return true;
    }
    return \mb_substr($string, - \mb_strlen($suffix)) === $suffix;
}

/**
 * Remove prefix from string.
 *
 * Does not change the string if it has a different prefix.
 *
 * @param string $string  The string that must start with the prefix.
 * @param string $prefix  The prefix of the string.
 *
 * @return string  String without prefix.
 */
function removePrefix($string, $prefix) {
    return startsWith($string, $prefix) ? \mb_substr($string, \mb_strlen($prefix)) : $string;
}
