;;; pyim-cstring-utils.el --- Chinese string tools for pyim. -*- lexical-binding: t; -*- ;; * Header ;; Copyright (C) 2021 Free Software Foundation, Inc. ;; Author: Feng Shu ;; Maintainer: Feng Shu ;; URL: https://github.com/tumashu/pyim ;; Keywords: convenience, Chinese, pinyin, input-method ;; This file is part of GNU Emacs. ;; GNU Emacs is free software: you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by ;; the Free Software Foundation, either version 3 of the License, or ;; (at your option) any later version. ;; GNU Emacs is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;; GNU General Public License for more details. ;; You should have received a copy of the GNU General Public License ;; along with GNU Emacs. If not, see . ;;; Commentary: ;;; Code: ;; * 代码 :code: (require 'cl-lib) (require 'pyim-cstring) (require 'pyim-dhashcache) (defgroup pyim-cstring nil "Chinese string tools for pyim." :group 'pyim) ;; ** 中文字符串分词相关功能 (defun pyim-cstring-split-to-list (chinese-string &optional max-word-length delete-dups prefer-short-word) "一个基于 pyim 的中文分词函数。这个函数可以将中文字符串 CHINESE-STRING 分词,得到一个词条 alist,这个 alist 的元素都是列 表,其中第一个元素为分词得到的词条,第二个元素为词条相对于字符串 中的起始位置,第三个元素为结束位置。分词时,默认词条不超过6个字符, 用户可以通过 MAX-WORD-LENGTH 来自定义,但值得注意的是:这个值设置 越大,分词速度越慢。 如果 DELETE-DUPS 设置为 non-nil, 一个中文字符串只保留一种分割方式。 比如: 我爱北京天安门 => 我爱 北京 天安门 如果 PREFER-SHORT-WORD 为 non-nil, 去重的时候则优先保留较短的词。 注意事项: 1. 这个工具使用暴力匹配模式来分词,*不能检测出* pyim 词库中不存在 的中文词条。 2. 这个函数的分词速度比较慢,仅仅适用于中文短句的分词,不适用于文 章分词。根据评估,20个汉字组成的字符串需要大约0.3s, 40个汉字 消耗1s,随着字符串长度的增大消耗的时间呈几何倍数增加。" ;; 如果 pyim 词库没有加载,加载 pyim 词库,确保 `pyim-dcache-get' 可以正常运行。 (pyim-dcache-init-variables) (let (result) (dolist (string-list (pyim-cstring--substrings chinese-string max-word-length)) (let ((pinyin-list (pyim-cstring-to-pinyin (car string-list) nil "-" t))) (dolist (pinyin pinyin-list) (let ((words (pyim-dcache-get pinyin '(code2word)))) ; 忽略个人词库可以提高速度 (dolist (word words) (when (equal word (car string-list)) (push string-list result))))))) (if delete-dups ;; 判断两个词条在字符串中的位置是否冲突,如果冲突,仅保留一个。 (cl-delete-duplicates result :test (lambda (x1 x2) (let ((begin1 (nth 1 x1)) (begin2 (nth 1 x2)) (end1 (nth 2 x1)) (end2 (nth 2 x2))) (not (or (<= end1 begin2) (<= end2 begin1))))) :from-end prefer-short-word) result))) (defun pyim-cstring-split-to-string (string &optional prefer-short-word separator max-word-length) "将中文字符串 STRING 分词. 在分词的位置插入空格或者自定义分隔符 SEPERATERS,默认情况下较长的 词条优先使用,如果 PREFER-SHORT-WORD 设置为 t,则优先使用较短的词 条。默认最长词条不超过6个字符,用户可以通 MAX-WORD-LENGTH 来自定 义词条的最大长度,但值得注意的是,这个值设置越大,分词速度越慢。" (mapconcat (lambda (str) (when (> (length str) 0) (if (not (pyim-string-match-p "\\CC" str)) (pyim-cstring--split-to-string str prefer-short-word separator max-word-length) str))) (pyim-pymap-split-string string) (or separator " "))) (defun pyim-cstring--split-to-string (chinese-string &optional prefer-short-word separator max-word-length) "`pyim-cstring-split-to-string' 内部函数。" (let ((str-length (length chinese-string)) (word-list (pyim-cstring-split-to-list chinese-string max-word-length t prefer-short-word)) position-list result) ;; 提取词条相对于字符串的位置信息。 (dolist (word word-list) (push (nth 1 word) position-list) (push (nth 2 word) position-list)) ;; 将位置信息由小到大排序。 (setq position-list (cl-delete-duplicates (sort position-list #'<))) ;; 在分词的位置插入空格或者用户指定的分隔符。 (dotimes (i str-length) (when (and (> i 0) (member i position-list)) (push (or separator " ") result)) (push (substring chinese-string i (1+ i)) result)) (setq result (nreverse result)) (string-join result))) (defun pyim-cstring-split-buffer () "将一个 buffer 中的中文文章,进行分词操作。" (interactive) (message "分词开始!") (goto-char (point-min)) (while (not (eobp)) (let ((string (buffer-substring-no-properties (line-beginning-position) (line-end-position)))) (delete-region (line-beginning-position) (min (+ (line-end-position) 1) (point-max))) (insert (pyim-cstring-split-to-string string)) (insert "\n"))) (goto-char (point-min)) (message "分词完成!")) ;; ** 获取光标处中文词条的功能 (defun pyim-cstring-words-at-point (&optional end-of-point) "获取光标当前的词条列表,当 END-OF-POINT 设置为 t 时,获取光标后的 词条列表。词条列表的每一个元素都是列表,这些列表的第一个元素为词 条,第二个元素为光标处到词条头部的距离,第三个元素为光标处到词条 尾部的距离。 其工作原理是: 1. 使用 `thing-at-point' 获取当前光标处的一个字符串,一般而言:英 文会得到一个单词,中文会得到一个句子。 2. 英文单词直接返回这个单词的列表。 3. 中文句子首先用 `pyim-cstring-split-to-list' 分词,然后根据光标 在中文句子中的位置,筛选出符合要求的中文词条。得到并返回 *一个* 或者 *多个* 词条的列表。" ;; ;; 光标到词 光标到词 ;; 首的距离 尾的距离 ;; | | ;; 获取光标当前的词条列表 -> (("的词" 2 0) ("词条" 1 1)) ;; (let* ((case-fold-search t) (current-pos (point)) (current-char (if end-of-point (string (following-char)) (string (preceding-char)))) (str (thing-at-point 'word t)) (str-length (length str)) (str-boundary (bounds-of-thing-at-point 'word)) (str-beginning-pos (when str-boundary (car str-boundary))) (str-end-pos (when str-boundary (cdr str-boundary))) (str-offset (when (and str-beginning-pos str-end-pos) (if (= current-pos str-end-pos) (- str-end-pos str-beginning-pos) (- current-pos str-beginning-pos)))) str-offset-adjusted words-alist results) ;; 当字符串长度太长时, `pyim-cstring-split-to-list' ;; 的速度比较慢,这里确保待分词的字符串长度不超过10. (when (and str (not (pyim-string-match-p "\\CC" str))) (if (> str-offset 5) (progn (setq str-offset-adjusted 5) (setq str (substring str (- str-offset 5) (min (+ str-offset 5) str-length)))) (setq str-offset-adjusted str-offset) (setq str (substring str 0 (min 9 str-length))))) (cond (;; FIXME: 在光标在 buffer 开头或者行首时,直接返回 nil. ;; 也许这块应该更好的处理。 (or (bobp) (eq (point) (line-beginning-position))) nil) ((and str (not (pyim-string-match-p "\\CC" str))) (setq words-alist (pyim-cstring-split-to-list str)) (dolist (word-list words-alist) (let ((word-begin (nth 1 word-list)) (word-end (nth 2 word-list))) (if (if end-of-point (and (< str-offset-adjusted word-end) (>= str-offset-adjusted word-begin)) (and (<= str-offset-adjusted word-end) (> str-offset-adjusted word-begin))) (push (list (car word-list) (- str-offset-adjusted word-begin) ;; 例如: ("你好" 1 1) (- word-end str-offset-adjusted)) results)))) (or results (list (if end-of-point (list current-char 0 1) (list current-char 1 0))))) (str (list (list str (- current-pos str-beginning-pos) (- str-end-pos current-pos))))))) ;; ** 让 forward/backward 支持中文 (defun pyim-cstring-forward-word (&optional arg) "向前移动 ARG 英文或者中文词,向前移动时基于 *最长* 的词移动。" (interactive "P") (or arg (setq arg 1)) (dotimes (_ arg) (let* ((words (pyim-cstring-words-at-point t)) (max-length (cl-reduce #'max (cons 0 (mapcar (lambda (word) (nth 2 word)) words)))) (max-length (max (or max-length 1) 1))) (forward-char max-length)))) (defun pyim-cstring-backward-word (&optional arg) "向后移动 ARG 个英文或者中文词,向后移动时基于 *最长* 的词移动。" (interactive "P") (or arg (setq arg 1)) (dotimes (_ arg) (let* ((words (pyim-cstring-words-at-point)) (max-length (cl-reduce #'max (cons 0 (mapcar (lambda (word) (nth 1 word)) words)))) (max-length (max (or max-length 1) 1))) (backward-char max-length)))) (defalias 'pyim-cwords-at-point #'pyim-cstring-words-at-point) (defalias 'pyim-forward-word #'pyim-cstring-forward-word) (defalias 'pyim-backward-word #'pyim-cstring-backward-word) ;; * Footer (provide 'pyim-cstring-utils) ;;; pyim-cstring-utils.el ends here