aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEllis KenyƑ <elken@users.noreply.github.com>2024-03-31 04:29:39 +0100
committerGitHub <noreply@github.com>2024-03-30 20:29:39 -0700
commitb776ed96b1a980af284c8fb07a8db387c1e6c358 (patch)
treec570a6f8be6baaec003b884a1cad71cfc3fd0607
parentc7040a6f6e60abf22225df91415df76f3f739cf1 (diff)
Prevent null issues when strings are unequal (#290)
In the case where s2 is larger than s1, this errors because the index is out of range. A good testcase is the below ```lisp (apheleia--align-point " <div class=\"left-[40rem] fixed inset-y-0 right-0 z-0 hidden lg:block xl:left-[50rem]\">\n <svg\n" "<div class=\"left-[40rem] fixed inset-y-0 right-0 z-0 hidden lg:block xl:left-[50rem]\">\n <svg" 6) ``` If I've implemented the indexing wrong, do let me know but this seems to work just fine now for `mix` (the formatter that triggered this) --------- Co-authored-by: Radon Rosborough <radon@intuitiveexplanations.com>
-rw-r--r--.github/workflows/lint.yml2
-rw-r--r--.gitignore1
-rw-r--r--CHANGELOG.md5
-rw-r--r--Makefile22
-rw-r--r--apheleia-dp.el181
-rw-r--r--apheleia-rcs.el49
-rwxr-xr-xscripts/check-line-length.bash2
-rwxr-xr-xtest/shared/run-func.bash (renamed from test/formatters/run-func.bash)10
-rw-r--r--test/unit/apheleia-unit-test.el97
9 files changed, 313 insertions, 56 deletions
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 2d2340d..0c8e88a 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -22,4 +22,4 @@ jobs:
env:
VERSION: ${{ matrix.emacs_version }}
run: >-
- make docker CMD="make lint lint-changelog"
+ make docker CMD="make unit lint lint-changelog"
diff --git a/.gitignore b/.gitignore
index 9a66d3e..ad07b99 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
.log
.tmp
auto
+vendor/buttercup-*/
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 067591b..0ca6cc3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ The format is based on [Keep a Changelog].
### Enhancements
### Formatters
### Bugs fixed
+* The point alignment algorithm, which has been slightly wrong since
+ 2019, has been fixed to more correctly use dynamic programming to
+ maintain the position of point. Also, some cases of a nil pointer
+ error during point alignment have been fixed ([#290]).
* `apheleia-indent-lisp-buffer` updated to apply local variables after
calling major-mode. Also includes setting for `indent-tabs-mode` ([#286]).
* [Formatter scripts](scripts/formatters) will now work on Windows if Emacs
@@ -14,6 +18,7 @@ The format is based on [Keep a Changelog].
[#286]: https://github.com/radian-software/apheleia/pull/286
[#285]: https://github.com/radian-software/apheleia/issues/285
+[#290]: https://github.com/radian-software/apheleia/pull/290
## 4.1 (released 2024-02-25)
### Enhancements
diff --git a/Makefile b/Makefile
index 1e2b7aa..ce9e944 100644
--- a/Makefile
+++ b/Makefile
@@ -10,6 +10,7 @@ TAG ?= latest
# The order is important for compilation.
for_compile := \
apheleia-utils.el \
+ apheleia-dp.el \
apheleia-formatter-context.el \
apheleia-log.el \
apheleia-formatters.el \
@@ -96,18 +97,33 @@ fmt-build-common: ## Build a Docker image with just the common base
fmt-docker: ## Start a Docker shell for testing formatters
@scripts/docker-run.bash -e FORMATTERS "apheleia-formatters:$(TAG)" "$(CMD)"
+APHELEIA_FT := -L test/formatters -l apheleia-ft
+
.PHONY: fmt-lint
fmt-lint: ## Do basic linting for formatter configuration
- @test/formatters/run-func.bash apheleia-ft-lint
+ @test/shared/run-func.bash apheleia-ft-lint $(APHELEIA_FT)
.PHONY: fmt-check
fmt-changed: ## Get list of changed formatters on this PR
- @test/formatters/run-func.bash apheleia-ft-changed
+ @test/shared/run-func.bash apheleia-ft-changed $(APHELEIA_FT)
.PHONY: fmt-test # env var: FORMATTERS
fmt-test: ## Actually run formatter tests
- @test/formatters/run-func.bash apheleia-ft-test
+ @test/shared/run-func.bash apheleia-ft-test $(APHELEIA_FT)
.PHONY: lint-changelog
lint-changelog: ## Report an error if the changelog wasn't updated
@scripts/lint-changelog.bash
+
+BUTTERCUP_VER := 1.34
+BUTTERCUP := vendor/buttercup-$(BUTTERCUP_VER)
+
+$(BUTTERCUP):
+ @rm -rf $(BUTTERCUP) && mkdir -p $(BUTTERCUP)
+ @curl -fsSL https://github.com/jorgenschaefer/emacs-buttercup/archive/refs/tags/v$(BUTTERCUP_VER).tar.gz -o $(BUTTERCUP).tar.gz
+ @tar -xf $(BUTTERCUP).tar.gz --strip-components=1 -C $(BUTTERCUP)
+ @rm $(BUTTERCUP).tar.gz
+
+.PHONY: unit
+unit: $(BUTTERCUP) ## Run unit tests
+ @$(BUTTERCUP)/bin/buttercup test/unit -L $(BUTTERCUP) -L .
diff --git a/apheleia-dp.el b/apheleia-dp.el
new file mode 100644
index 0000000..5a7d9fb
--- /dev/null
+++ b/apheleia-dp.el
@@ -0,0 +1,181 @@
+;;; apheleia-dp.el --- Dynamic programming -*- lexical-binding: t -*-
+
+;; SPDX-License-Identifier: MIT
+
+;;; Commentary:
+
+;; The dynamic programming implementation for edit distance.
+
+;;; Code:
+
+;; I wrote this code in 2019 and completely forgot how it worked by
+;; 2024. So, for the next person who has to touch it, here is an
+;; explanation.
+;;
+;; This is essentially the standard dynamic programming algorithm for
+;; string alignment, which is the same thing as edit distance, which
+;; is to say Levenshtein distance, that you learn in Algorithms 101.
+;;
+;; Step 1 is we take two strings and compute the edit distance between
+;; them. This is the minimum sequence of insertions, deletions, and
+;; substitutions (changing one character for another) that you can use
+;; to transform the first string into the second, or vice versa.
+;;
+;; As an example suppose that we have the string "HEL|LO" where point
+;; is represented by the pipe character. The formatter changes the
+;; string to "HEO". We want to figure out where to put point. We want
+;; to obtain the result "HE|O" as this will seem intuitive to the
+;; user.
+;;
+;; For step 1 we determine that the edit distance is 2 because you can
+;; transform "HELLO" into "HEO" by two deletions (deleting the two
+;; Ls).
+;;
+;; To implement step 1 we use dynamic programming. Specifically we
+;; construct a six-by-four table (because the lengths of the strings
+;; are five and three, respectively, and we add one to each
+;; dimension). Each entry in the table will show the edit distance
+;; between two substrings, specifically column i1 and row i2 is the
+;; edit distance between the first i1 chars of the first string and
+;; the first i2 chars of the second string. Zero-indexed.
+;;
+;; We start out by filling in the first row and column. The upper-left
+;; cell is zero because it is the edit distance from the empty string
+;; to itself. The remainder of the first row and column are increasing
+;; integers because they are the edit distance from the empty string
+;; to a string of length n, or vice versa, which is always n
+;; insertions or deletions.
+;;
+;; Then the dynamic programming part is filling in the rest of the
+;; table based on previously computed entries. For each cell we have
+;; the following choices:
+;;
+;; 1. Add one to the value from the cell above. Thinking about the
+;; prefixes, this corresponds to inserting a char from the second
+;; string, which is an insertion in terms of edit distance.
+;;
+;; 2. Add one to the cell on the left. This corresponds to inserting a
+;; char from the first string, which is a deletion in terms of edit
+;; distance.
+;;
+;; 3. Add one to the cell diagonally above and to the left. This
+;; corresponds to a substitution in terms of edit distance. But if
+;; the chars at this row and column happen to be the same between
+;; the first and second strings, it is a substitution of the same
+;; char for itself, so not actually a substitution - thus we only
+;; add one if the chars differ.
+;;
+;; Here is the edit distance table for "HELLO" and "HEO":
+;;
+;; - H E L L O
+;; - 0 1 2 3 4 5
+;; H 1 0 1 2 3 4
+;; E 2 1 0 1 2 3
+;; O 3 2 1 1 2 2
+;;
+;; Step 2 is to take this table and convert it into the sequence of
+;; editing operations that transforms the first string to the second.
+;; Imagine that each time we fill in a cell, we draw an arrow from
+;; that cell to the cell that we used as the basis for its value (one
+;; of the 1, 2, 3 options in the list above). Starting at the
+;; lower-right cell, we can follow the arrows, which will point a
+;; unique path to the upper-left. These are the cells that the path
+;; will traverse for the example table above:
+;;
+;; - H E L L O
+;; - * 1 2 3 4 5
+;; H 1 * 1 2 3 4
+;; E 2 1 * * * 3
+;; O 3 2 1 1 2 *
+;;
+;; Tracing to the upper-left, we get this sequence of operations:
+;;
+;; 1. O - Leave alone
+;; 2. L - Delete
+;; 3. L - Delete
+;; 4. E - Leave alone
+;; 5. H - Leave alone
+;;
+;; Step 3 is to take this sequence of operations and use it to compute
+;; the correct offset within the modified string. Since we start with
+;; "HEL|LO", note that operations 1-2 will not affect point since they
+;; occur after point (and the numerical value of point is the number
+;; of chars before point, no matter how many are after it), while
+;; operations 3-5 are relevant for point. But we do need to go through
+;; operations 1-2 during step 2 to determine the correct path through
+;; the dynamic programming table.
+;;
+;; When going through steps 3-5, we may adjust point. We assume point
+;; will start at the same position in the modified string as it was in
+;; the original string, and then possibly make changes to it. In the
+;; case of a substitution or no-op, we don't move point. In the case
+;; of a deletion that occurs before point, we need to decrease point.
+;; In the case of an insertion that occurs before point, we need to
+;; increase point.
+;;
+;; For our given example, we have two no-ops and one deletion that
+;; occur in steps 3-5 before point, so "HEL|LO" becomes "HE|O" with
+;; point changing from 3 to 2 (assuming the start of the string is 0).
+;;
+;; This example is tested in the unit tests file if you want to look
+;; there to verify usage.
+
+(require 'cl-lib)
+
+(cl-defun apheleia--edit-distance-table (s1 s2)
+ "Align strings S1 and S2 for minimum edit distance.
+Return the dynamic programming table as a hash table which maps
+cons of integers (I1 . I2) to the edit distance between the first
+I1 characters of S1 and the first I2 characters of S2."
+ (let ((table (make-hash-table :test #'equal)))
+ (dotimes (i1 (1+ (length s1)))
+ (puthash (cons i1 0) i1 table))
+ (dotimes (i2 (1+ (length s2)))
+ (puthash (cons 0 i2) i2 table))
+ (dotimes (i1 (length s1))
+ ;; Iterate from 1 to length+1.
+ (cl-incf i1)
+ (dotimes (i2 (length s2))
+ (cl-incf i2)
+ (let ((ins (1+ (gethash (cons i1 (1- i2)) table)))
+ (del (1+ (gethash (cons (1- i1) i2) table)))
+ (sub (gethash (cons (1- i1) (1- i2)) table)))
+ (unless (= (aref s1 (1- i1)) (aref s2 (1- i2)))
+ (cl-incf sub))
+ (puthash (cons i1 i2) (min ins del sub) table))))
+ table))
+
+(defun apheleia--align-point (s1 s2 p1)
+ "Given strings S1 and S2 and index P1 in S1, return matching index P2 in S2.
+If S1 and S2 are the same, then P1 and P2 will also be the same.
+Otherwise, the text of S2 surrounding P2 is \"similar\" to the
+text of S1 surrounding P1."
+ (let* ((table (apheleia--edit-distance-table s1 s2))
+ (i1 (length s1))
+ (i2 (length s2))
+ (p2 p1))
+ (while (not (= i1 i2 0))
+ (let ((ins (1+ (gethash (cons i1 (1- i2)) table 9999)))
+ (del (1+ (gethash (cons (1- i1) i2) table 9999)))
+ (sub (gethash (cons (1- i1) (1- i2)) table 9999)))
+ (unless (and (> 0 i1) (> 0 i2)
+ (= (aref s1 (1- i1)) (aref s2 (1- i2))))
+ (cl-incf sub))
+ (let ((cost (min ins del sub)))
+ (cond
+ ((= cost sub)
+ (cl-decf i1)
+ (cl-decf i2))
+ ((= cost ins)
+ (cl-decf i2)
+ (when (< i1 p1)
+ (cl-incf p2)))
+ ((= cost del)
+ (cl-decf i1)
+ (when (< i1 p1)
+ (cl-decf p2)))))))
+ p2))
+
+(provide 'apheleia-dp)
+
+;;; apheleia-dp.el ends here
diff --git a/apheleia-rcs.el b/apheleia-rcs.el
index 2e10c12..e462921 100644
--- a/apheleia-rcs.el
+++ b/apheleia-rcs.el
@@ -9,59 +9,12 @@
;;; Code:
+(require 'apheleia-dp)
(require 'apheleia-log)
(require 'cl-lib)
(require 'subr-x)
-(cl-defun apheleia--edit-distance-table (s1 s2)
- "Align strings S1 and S2 for minimum edit distance.
-Return the dynamic programming table as has table which maps cons
-of integers (I1 . I2) to the edit distance between the first I1
-characters of S1 and the first I2 characters of S2."
- (let ((table (make-hash-table :test #'equal)))
- (dotimes (i1 (1+ (length s1)))
- (puthash (cons i1 0) i1 table))
- (dotimes (i2 (1+ (length s2)))
- (puthash (cons 0 i2) i2 table))
- (dotimes (i1 (length s1))
- ;; Iterate from 1 to length+1.
- (cl-incf i1)
- (dotimes (i2 (length s2))
- (cl-incf i2)
- (let ((ins (1+ (gethash (cons i1 (1- i2)) table)))
- (del (1+ (gethash (cons (1- i1) i2) table)))
- (sub (gethash (cons (1- i1) (1- i2)) table)))
- (unless (= (aref s1 (1- i1)) (aref s2 (1- i2)))
- (cl-incf sub))
- (puthash (cons i1 i2) (min ins del sub) table))))
- table))
-
-(defun apheleia--align-point (s1 s2 p1)
- "Given strings S1 and S2 and index P1 in S1, return matching index P2 in S2.
-If S1 and S2 are the same, then P1 and P2 will also be the same.
-Otherwise, the text of S2 surrounding P2 is \"similar\" to the
-text of S1 surrounding P1."
- (let* ((table (apheleia--edit-distance-table s1 s2))
- (i1 (length s1))
- (i2 (length s2)))
- (while (> i1 p1)
- (let ((ins (1+ (gethash (cons i1 (1- i2)) table)))
- (del (1+ (gethash (cons (1- i1) i2) table)))
- (sub (gethash (cons (1- i1) (1- i2)) table)))
- (unless (= (aref s1 (1- i1)) (aref s2 (1- i2)))
- (cl-incf sub))
- (let ((cost (min ins del sub)))
- (cond
- ((= cost ins)
- (cl-decf i2))
- ((= cost del)
- (cl-decf i1))
- ((= cost sub)
- (cl-decf i1)
- (cl-decf i2))))))
- i2))
-
(defun apheleia--map-rcs-patch (func)
"Map over the RCS patch in the current buffer.
For each RCS patch command, FUNC is called with an alist that has
diff --git a/scripts/check-line-length.bash b/scripts/check-line-length.bash
index 52a419c..20f66d9 100755
--- a/scripts/check-line-length.bash
+++ b/scripts/check-line-length.bash
@@ -9,6 +9,8 @@ find=(
-name .log -prune -o
-path ./scripts/pnp-bin.js -prune -o
-path ./test/formatters -prune -o
+ -path ./test/unit -prune -o
+ -path ./vendor -prune -o
-name "*.elc" -o
-type f -print
)
diff --git a/test/formatters/run-func.bash b/test/shared/run-func.bash
index 621edde..f5d6f96 100755
--- a/test/formatters/run-func.bash
+++ b/test/shared/run-func.bash
@@ -2,11 +2,13 @@
set -euo pipefail
+func="$1"
+shift
+
# Avoid using git to get project directory, due to
# https://github.com/radian-software/apheleia/pull/89#issuecomment-1107319617
-cd "$(dirname "$0")"
-repo="$(cd ../.. && pwd)"
+cd "$(dirname "$0")/../.."
-exec emacs --batch -L "${repo}" -L . -l apheleia-ft \
- --eval "(setq debug-on-error t)" -f "$1"
+exec emacs --batch -L . "$@" \
+ --eval "(setq debug-on-error t)" -f "${func}"
diff --git a/test/unit/apheleia-unit-test.el b/test/unit/apheleia-unit-test.el
new file mode 100644
index 0000000..c5d54cc
--- /dev/null
+++ b/test/unit/apheleia-unit-test.el
@@ -0,0 +1,97 @@
+;; -*- lexical-binding: t -*-
+
+;; `apheleia-unit-tests' - unit tests using ERT.
+
+(require 'apheleia)
+
+;; Using buttercup because ert makes it really hard to write tabular
+;; tests that report enough context to debug when they fail.
+(require 'buttercup)
+
+(require 'map)
+
+(defun apheleia-unit-find-vars (form)
+ (cond
+ ((symbolp form)
+ (list form))
+ ((listp form)
+ (mapcan #'apheleia-unit-find-vars (cdr form)))))
+
+(describe "apheleia--edit-distance-table"
+ (cl-flet ((table-error
+ (before-str after-str expected-table)
+ (let* ((hash (apheleia--edit-distance-table before-str after-str))
+ (table
+ (mapcar
+ (lambda (i2)
+ (mapcar
+ (lambda (i1)
+ (gethash (cons i1 i2) hash))
+ (number-sequence 0 (length before-str))))
+ (number-sequence 0 (length after-str)))))
+ (unless (equal table expected-table)
+ table))))
+ (cl-macrolet ((testcases
+ (description &rest specs)
+ `(it ,description
+ ,@(mapcar
+ (lambda (spec)
+ `(expect
+ (table-error ,@spec)
+ :to-be nil))
+ specs))))
+ (testcases
+ "computes the example from apheleia-dp file header"
+ ("hello" "heo"
+ '((0 1 2 3 4 5)
+ (1 0 1 2 3 4)
+ (2 1 0 1 2 3)
+ (3 2 1 1 2 2)))))))
+
+(describe "apheleia--align-point"
+ (cl-flet ((alignment-error
+ (before-spec after-spec)
+ (let* ((before-pos (string-match "|" before-spec))
+ (after-pos (string-match "|" after-spec))
+ (before-str (replace-regexp-in-string "|" "" before-spec))
+ (after-str (replace-regexp-in-string "|" "" after-spec))
+ (real-after-pos (apheleia--align-point before-str after-str before-pos)))
+ (unless (= after-pos real-after-pos)
+ (concat (substring after-str 0 real-after-pos) "|"
+ (substring after-str real-after-pos))))))
+ (cl-macrolet ((testcases
+ (description &rest specs)
+ `(it ,description
+ ,@(mapcar
+ (lambda (spec)
+ (cl-destructuring-bind (before-spec after-spec) spec
+ `(expect
+ (alignment-error ,before-spec ,after-spec)
+ :to-be nil)))
+ specs))))
+ (testcases
+ "does normal alignments"
+ ("hel|lo"
+ "he|o")
+ ("hello| world"
+ "helo| word")
+ ("hello | world"
+ "hello|world"))
+ (testcases
+ "solves issue #2"
+ (" if (node.type === \"CallExpression\" && (node.callee.type === \"Import\" @@ (node.callee.type === \"Identifier\" && node.callee.name === \"require\"))) {
+ //|
+ }
+"
+ " if (
+ node.type === \"CallExpression\" &&
+ (node.callee.type === \"Import\" @@
+ (node.callee.type === \"Identifier\" && node.callee.name === \"require\"))
+ ) {
+ //|
+ }
+"))
+ (testcases
+ "solves issue #290"
+ (" | <div class=\"left-[40rem] fixed inset-y-0 right-0 z-0 hidden lg:block xl:left-[50rem]\">\n <svg\n"
+ "|<div class=\"left-[40rem] fixed inset-y-0 right-0 z-0 hidden lg:block xl:left-[50rem]\">\n <svg")))))