lib/classes/TextFormat.php


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289

<?php
/**
 * TextFormat.php - simple generic text markup parser
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of
 * the License, or (at your option) any later version.
 *
 * @author      Elmar Ludwig
 * @license     http://www.gnu.org/licenses/gpl-2.0.html GPL version 2
 * @category    Stud.IP
 */

/**
 * This class implements a somewhat generic text markup parser. It is
 * optimized for rules of the form:
 *
 * markup_rule : START_TAG | START_TAG markup END_TAG
 * markup : TEXT | TEXT markup_rule markup
 *
 * where START_TAG and END_TAG are defined using regular expressions.
 * All rules are applied simultaneously, i.e. the output of a markup
 * rule is not processed again by the parser. The order of the rules
 * matters, however, because the first matching expression determines
 * which markup rule is applied (in case multiple rules match at the
 * same position in the input string).
 *
 * This example adds a new markup rule for 'smile' that replaces each
 * occurrence of the string ':-)' with a corresponding image tag:
 *
 * $markup->addMarkup('smile', ':-\)', NULL,
 *     function($markup) {
 *         return '<img src="smile.png">';
 *     }
 * );
 *
 * This example adds markup for the BBCode '[b]...[/b]' construct:
 *
 * $markup->addMarkup('bold', '\[b\]', '\[\/b\]',
 *     function($markup, $matches, $contents) {
 *         return '<b>' . $contents . '</b>';
 *     }
 * );
*/
class TextFormat
{
    /** @var array rule name => ['start' => regexp, 'end' => regexp|null, 'callback' => callable] */
    private $markup_rules;

    /** @var string|null cached tokenizer expression, null when it must be rebuilt */
    private $start_regexp;

    /** @var array names of the rules whose contents are currently being parsed (outermost first) */
    private $rule_stack;

    /**
     * Initializes a new TextFormat instance with an initial set of
     * markup rules.
     *
     * @param array $markup_rules   list of markup rules, keyed by rule name
     */
    public function __construct($markup_rules = [])
    {
        $this->markup_rules = $markup_rules;
        $this->start_regexp = null;
        $this->rule_stack = [];
    }

    /**
     * Adds a new markup rule to this TextFormat instance. This can
     * also be used to replace an existing markup rule. The end regular
     * expression is optional (i.e. may be NULL) to indicate that this
     * rule has an empty content model. The callback is called whenever
     * the rule matches and is passed the following arguments:
     *
     * - $markup    the markup parser object
     * - $matches   match results of preg_match for $start
     * - $contents  (parsed) contents of this markup rule
     *
     * Sometimes you may want your rule to apply before another specific rule
     * will apply. For this case the parameter $before defines a rulename of
     * existing markup, before which your rule should apply. If $before does
     * not name an existing rule (or names the rule itself), the rule is
     * appended, or replaced in place if it already exists.
     *
     * @param string $name      name of this rule
     * @param string $start     start regular expression
     * @param string $end       end regular expression (optional)
     * @param callback $callback function generating output of this rule
     * @param string $before mark before which rule this rule should be appended
     */
    public function addMarkup($name, $start, $end, $callback, $before = null)
    {
        $new_rule = compact('start', 'end', 'callback');

        $reorder = $before !== null
            && $before !== $name
            && in_array($before, array_keys($this->markup_rules), true);

        if (!$reorder) {
            // plain append, or in-place replacement of an existing rule
            $this->markup_rules[$name] = $new_rule;
        } else {
            // Rebuild the rule list so that the new definition ends up in
            // front of $before and a stale definition of the same name can
            // never overwrite it again.
            $rules = [];
            $placed = false;

            foreach ($this->markup_rules as $rule_name => $rule) {
                if (!$placed && $rule_name === $before) {
                    $rules[$name] = $new_rule;
                    $placed = true;
                }

                // array keys may have been normalized to int, so compare as strings
                if ((string) $rule_name === (string) $name) {
                    if (!$placed) {
                        // rule already sits in front of $before: replace in place
                        $rules[$name] = $new_rule;
                        $placed = true;
                    }
                    continue;
                }

                $rules[$rule_name] = $rule;
            }

            $this->markup_rules = $rules;
        }

        // the tokenizer expression depends on the rule set
        $this->start_regexp = null;
    }

    /**
     * Returns a single markup-rule if it exists.
     *
     * @param string $name      name of the rule
     *
     * @return array|null  array('start' => "...", 'end' => "...", 'callback' => "...")
     *                     or NULL if there is no rule of that name
     */
    public function getMarkup($name)
    {
        return isset($this->markup_rules[$name]) ? $this->markup_rules[$name] : null;
    }

    /**
     * Removes a markup rule from this TextFormat instance.
     *
     * @param string $name      name of the rule
     */
    public function removeMarkup($name)
    {
        unset($this->markup_rules[$name]);
        $this->start_regexp = null;
    }

    /**
     * Returns the regular expression used to split the input text
     * into individual tokens. This expression is constructed from
     * the start and end expressions of all markup rules and cached
     * until the rule set changes.
     *
     * @return string   regular expression for use by the tokenizer
     *                  (NULL if there are no markup rules)
     */
    private function getTokenRegexp()
    {
        if ($this->start_regexp === null && count($this->markup_rules)) {
            $tags = [];

            foreach ($this->markup_rules as $rule) {
                $tags[] = $rule['start'];

                if (isset($rule['end'])) {
                    $tags[] = $rule['end'];
                }
            }

            $tags = array_unique($tags);

            // turn every unescaped capturing group "(" into "(?:" so that
            // preg_split() only reports the single outer delimiter group
            $regexp = preg_replace('/(?<!\\\\)(\\\\\\\\)*\((?!\?)/', '$0?:', implode('|', $tags));
            $this->start_regexp = '/(' . $regexp . ')/msu';
        }

        return $this->start_regexp;
    }

    /**
     * Applies the markup rules to the input text and returns the result.
     *
     * @param string $text      string to format
     *
     * @return string   formatted text
     *
     * @throws Exception  if the input cannot be tokenized (e.g. invalid
     *                    rule expression or malformed UTF-8 input)
     */
    public function format($text)
    {
        $text = (string) $text;
        $pattern = $this->getTokenRegexp();

        if ($pattern === null) {
            // no rules: the whole text is one plain token
            $parts = [null, [$text, 0]];
        } else {
            $options = PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_OFFSET_CAPTURE;
            $parts = preg_split($pattern, $text, -1, $options);

            if (!is_array($parts)) {
                throw new Exception(__CLASS__ . ': ' . self::describePregError());
            }

            // dummy head element: formatParts() always advances with next()
            array_unshift($parts, null);
        }

        return $this->formatParts($text, $parts);
    }

    /**
     * Builds a human readable description of the most recent PCRE failure.
     * Runtime failures (bad UTF-8, backtrack limit, ...) do not raise a PHP
     * warning, so error_get_last() alone is not a reliable source.
     *
     * @return string   error description
     */
    private static function describePregError()
    {
        $code = preg_last_error();

        if ($code !== PREG_NO_ERROR && $code !== PREG_INTERNAL_ERROR) {
            return function_exists('preg_last_error_msg')
                ? preg_last_error_msg()
                : 'PCRE error code ' . $code;
        }

        // pattern compilation problems are reported as a regular warning
        $last_error = error_get_last();

        if (is_array($last_error) && isset($last_error['message'])) {
            return $last_error['message'];
        }

        return 'unknown error while tokenizing the input';
    }

    /**
     * Quotes the input text in a way appropriate for the output format,
     * but does not apply any markup rules. This could involve escaping
     * special characters (similar to htmlentities) or other processing.
     *
     * The default implementation in this class does nothing.
     *
     * @param string $text      string to quote
     *
     * @return string   quoted text
     */
    public function quote($text)
    {
        return $text;
    }

    /**
     * Internal method used by format() to apply markup rules to the
     * individual tokens of the input string. $open_rule indicates
     * whether a closing element (and which one) is expected.
     *
     * The token list alternates between plain text and delimiter tokens;
     * its internal array pointer marks the current parse position and is
     * shared with recursive invocations through the reference.
     *
     * @param string $text      string to format
     * @param array $parts      token list of input string
     * @param array $open_rule  open markup rule, if any (may be NULL)
     *
     * @return string   formatted text
     */
    protected function formatParts($text, &$parts, $open_rule = NULL)
    {
        // leading plain text token
        $part = next($parts);
        $result = $this->quote($part[0]);

        while (($part = next($parts)) !== false) {
            // closing tag of the enclosing rule: hand control back to the caller
            if (isset($open_rule)
                && self::matchPart($open_rule['end'], $text, $matches, $part[1])) {
                return $result;
            }

            $matched = false;

            foreach ($this->markup_rules as $ruleKey => $rule) {
                if (!self::matchPart($rule['start'], $text, $matches, $part[1])) {
                    continue;
                }

                $contents = null;

                if (isset($rule['end'])) {
                    $saved_parts = $parts;

                    $this->rule_stack[] = $ruleKey;
                    try {
                        $contents = $this->formatParts($text, $parts, $rule);
                    } finally {
                        // keep the stack balanced even if a callback throws
                        array_pop($this->rule_stack);
                    }

                    // skip this markup rule in case of missing closing tag
                    if (current($parts) === false) {
                        $parts = $saved_parts;
                        continue;
                    }
                }

                $result .= call_user_func($rule['callback'], $this, $matches, $contents);
                $matched = true;
                break;
            }

            if (!$matched) {
                // delimiter that no rule could consume is emitted as plain text
                $result .= $this->quote($part[0]);
            }

            // plain text token following the delimiter
            $part = next($parts);
            $result .= $this->quote($part[0]);
        }

        return $result;
    }

    /**
     * Tries to match the given pattern against the text at a specified
     * offset. If a match is found at this position, $matches is filled
     * with the results of the search (plain strings, as preg_match would
     * return them without PREG_OFFSET_CAPTURE).
     *
     * @param string $pattern   regular expression
     * @param string $text      string to match against
     * @param array  $matches   result will be stored here
     * @param int    $offset    offset into $text
     *
     * @return bool   true if the pattern matches at this offset
     */
    private static function matchPart($pattern, $text, &$matches, $offset)
    {
        $pattern = '/' . $pattern . '/msu';
        $result = preg_match($pattern, $text, $matches, PREG_OFFSET_CAPTURE, $offset);

        if (!$result) {
            return false;
        }

        $match_offset = $matches[0][1];

        // strip the offsets again; done by index to avoid leaving a
        // dangling reference inside the caller's array
        foreach ($matches as $index => $match) {
            $matches[$index] = $match[0];
        }

        return $match_offset === $offset;
    }

    /**
     * Return true if the current markup is surrounded by another markup.
     *
     * @param string $rule Name of the rule (its key in markup_rules).
     *
     * @return boolean  True if inside of $rule-markup.
     */
    public function isInsideOf($rule)
    {
        foreach ($this->rule_stack as $open_rule) {
            // compare as strings: array keys may be ints, but loose
            // comparison would treat e.g. "abc" and 0 as equal on PHP 7
            if ((string) $open_rule === (string) $rule) {
                return true;
            }
        }

        return false;
    }
}