aboutsummaryrefslogtreecommitdiff
path: root/lib/classes/TextFormat.php
diff options
context:
space:
mode:
authorJan-Hendrik Willms <tleilax+github@gmail.com>2021-07-22 16:07:19 +0200
committerJan-Hendrik Willms <tleilax+github@gmail.com>2021-07-22 16:19:12 +0200
commita3da1483a9e689846179159355badfec8073dbec (patch)
tree770dcca6bdf5f6f2a11b0e7fcbbeda6919a3fc52 /lib/classes/TextFormat.php
current code from svn, revision 62608
Diffstat (limited to 'lib/classes/TextFormat.php')
-rw-r--r--lib/classes/TextFormat.php289
1 files changed, 289 insertions, 0 deletions
diff --git a/lib/classes/TextFormat.php b/lib/classes/TextFormat.php
new file mode 100644
index 0000000..de3da84
--- /dev/null
+++ b/lib/classes/TextFormat.php
@@ -0,0 +1,289 @@
+<?php
+/**
+ * TextFormat.php - simple generic text markup parser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * @author Elmar Ludwig
+ * @license http://www.gnu.org/licenses/gpl-2.0.html GPL version 2
+ * @category Stud.IP
+ */
+
+/**
+ * This class implements a somewhat generic text markup parser. It is
+ * optimized for rules of the form:
+ *
+ * markup_rule : START_TAG | START_TAG markup END_TAG
+ * markup : TEXT | TEXT markup_rule markup
+ *
+ * where START_TAG and END_TAG are defined using regular expressions.
+ * All rules are applied simultaneously, i.e. the output of a markup
+ * rule is not processed again by the parser. The order of the rules
+ * matters, however, because the first matching expression determines
+ * which markup rule is applied (in case multiple rules match at the
+ * same position in the input string).
+ *
+ * This example adds a new markup rule for 'smile' that replaces each
+ * occurrence of the string ':-)' with a corresponding image tag:
+ *
+ * $markup->addMarkup('smile', ':-\)', NULL,
+ * function($markup) {
+ * return '<img src="smile.png">';
+ * }
+ * );
+ *
+ * This example adds markup for the BBCode '[b]...[/b]' construct:
+ *
+ * $markup->addMarkup('bold', '\[b\]', '\[\/b\]',
+ * function($markup, $matches, $contents) {
+ * return '<b>' . $contents . '</b>';
+ * }
+ * );
+*/
+class TextFormat
+{
+ private $markup_rules;
+ private $start_regexp;
+ private $rule_stack;
+
+ /**
+ * Initializes a new TextFormat instance with an initial set of
+ * markup rules.
+ *
+ * @param array $markup_rules list of markup rules
+ */
+ public function __construct($markup_rules = [])
+ {
+ $this->markup_rules = $markup_rules;
+ $this->start_regexp = NULL;
+ $this->rule_stack = [];
+ }
+
+ /**
+ * Adds a new markup rule to this TextFormat instance. This can
+ * also be used to replace an existing markup rule. The end regular
+ * expression is optional (i.e. may be NULL) to indicate that this
+ * rule has an empty content model. The callback is called whenever
+ * the rule matches and is passed the following arguments:
+ *
+ * - $markup the markup parser object
+ * - $matches match results of preg_match for $start
+ * - $contents (parsed) contents of this markup rule
+ *
+ * Sometimes you may want your rule to apply before another specific rule
+ * will apply. For this case the parameter $before defines a rulename of
+ * existing markup, before which your rule should apply.
+ *
+ * @param string $name name of this rule
+ * @param string $start start regular expression
+ * @param string $end end regular expression (optional)
+ * @param callback $callback function generating output of this rule
+ * @param string $before mark before which rule this rule should be appended
+ */
+ public function addMarkup($name, $start, $end, $callback, $before = null)
+ {
+ $inserted = false;
+ foreach ($this->markup_rules as $rule_name => $rule) {
+ if ($rule_name === $before) {
+ $this->markup_rules[$name] = compact('start', 'end', 'callback');
+ $inserted = true;
+ }
+ if ($inserted) {
+ unset($this->markup_rules[$rule_name]);
+ $this->markup_rules[$rule_name] = $rule;
+ }
+ }
+ if (!$inserted) {
+ $this->markup_rules[$name] = compact('start', 'end', 'callback');
+ }
+ $this->start_regexp = NULL;
+ }
+
+ /**
+ * Returns a single markup-rule if it exists.
+ * @return array: array('start' => "...", 'end' => "...", 'callback' => "...")
+ */
+ public function getMarkup($name) {
+ return $this->markup_rules[$name];
+ }
+
+ /**
+ * Removes a markup rule from this TextFormat instance.
+ *
+ * @param string $name name of the rule
+ */
+ public function removeMarkup($name)
+ {
+ unset($this->markup_rules[$name]);
+ $this->start_regexp = NULL;
+ }
+
+ /**
+ * Returns the regular expression used to split the input text
+ * into individual tokens. This expression is constructed from
+ * the start and end expressions of all markup rules.
+ *
+ * @return string regular expression for use by the tokenizer
+ */
+ private function getTokenRegexp()
+ {
+ if ($this->start_regexp === NULL && count($this->markup_rules)) {
+ foreach ($this->markup_rules as $rule) {
+ $tags[] = $rule['start'];
+
+ if (isset($rule['end'])) {
+ $tags[] = $rule['end'];
+ }
+ }
+
+ $tags = array_unique($tags);
+ $regexp = preg_replace('/(?<!\\\\)(\\\\\\\\)*\((?!\?)/', '$0?:', join('|', $tags));
+ $this->start_regexp = '/(' . $regexp . ')/msu';
+ }
+
+ return $this->start_regexp;
+ }
+
+ /**
+ * Applies the markup rules to the input text and returns the result.
+ *
+ * @param string $text string to format
+ *
+ * @return string formatted text
+ */
+ public function format($text)
+ {
+ $pattern = $this->getTokenRegexp();
+ $options = PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_OFFSET_CAPTURE;
+
+ if (isset($pattern)) {
+ $parts = preg_split($pattern, $text, -1, $options);
+ if (!is_array($parts)) {
+ $last_error = error_get_last();
+ throw new Exception(__CLASS__ . ': ' . $last_error['message']);
+ }
+ array_unshift($parts, NULL);
+ } else {
+ $parts = [NULL, [$text, 0]];
+ }
+
+ return $this->formatParts($text, $parts);
+ }
+
+ /**
+ * Quotes the input text in a way appropriate for the output format,
+ * but does not apply any markup rules. This could involve escaping
+ * special characters (similar to htmlentities) or other processing.
+ *
+ * The default implementation in this class does nothing.
+ *
+ * @param string $text string to quote
+ *
+ * @return string quoted text
+ */
+ public function quote($text)
+ {
+ return $text;
+ }
+
+ /**
+ * Internal method used by format() to apply markup rules to the
+ * individual tokens of the input string. $open_rule indicates
+ * whether a closing element (and which one) is expected.
+ *
+ * @param string $text string to format
+ * @param array $pars token list of input string
+ * @param array $open_rule open markup rule, if any (may be NULL)
+ *
+ * @return string formatted text
+ */
+ protected function formatParts($text, &$parts, $open_rule = NULL)
+ {
+ $part = next($parts);
+ $result = $this->quote($part[0]);
+
+ while (($part = next($parts)) !== false) {
+ if (isset($open_rule)) {
+ if (self::matchPart($open_rule['end'], $text, $matches, $part[1])) {
+ return $result;
+ }
+ }
+
+ $matched = false;
+
+ foreach ($this->markup_rules as $ruleKey => $rule) {
+ if (self::matchPart($rule['start'], $text, $matches, $part[1])) {
+ if (isset($rule['end'])) {
+ $saved_parts = $parts;
+
+ array_push($this->rule_stack, $ruleKey);
+ $contents = $this->formatParts($text, $parts, $rule);
+ array_pop($this->rule_stack);
+
+ // skip this markup rule in case of missing closing tag
+ if (current($parts) === false) {
+ $parts = $saved_parts;
+ continue;
+ }
+ } else {
+ $contents = NULL;
+ }
+
+ $result .= call_user_func($rule['callback'], $this, $matches, $contents);
+ $matched = true;
+ break;
+ }
+ }
+
+ if (!$matched) {
+ $result .= $this->quote($part[0]);
+ }
+
+ $part = next($parts);
+ $result .= $this->quote($part[0]);
+ }
+
+ return $result;
+ }
+
+ /**
+ * Tries to match the given pattern against the text at a specified
+ * offset. If a match is found at this position, $matches is filled
+ * with the results of the search.
+ *
+ * @param string $pattern regular expression
+ * @param string $text string to match against
+ * @param array $matches result will be stored here
+ * @param int $offset offset into $text
+ *
+ * @return boolen true if the pattern matches at this offset
+ */
+ private static function matchPart($pattern, $text, &$matches, $offset)
+ {
+ $pattern = '/' . $pattern . '/msu';
+ $result = preg_match($pattern, $text, $matches, PREG_OFFSET_CAPTURE, $offset);
+
+ if ($result) {
+ $match_offset = $matches[0][1];
+
+ foreach ($matches as &$match) {
+ $match = $match[0];
+ }
+ }
+
+ return $result && $match_offset === $offset;
+ }
+
+ /**
+ * Return true if the current markup is surrounded by another markup.
+ * @param string $rule Name of the rule (it's key in markup_rules).
+ * @return boolean True if inside of $rule-markup.
+ */
+ public function isInsideOf($rule)
+ {
+ return in_array($rule, $this->rule_stack);
+ }
+}