7 files changed, 180 insertions, 56 deletions
diff --git a/NEWS.org b/NEWS.org
index 625737f..21cf32b 100644
--- a/NEWS.org
+++ b/NEWS.org
@@ -1,3 +1,10 @@
+* Version 1.17.0
+- Add maximum output token support.  Ellama now exposes ~ellama-max-tokens~ and
+  passes per-request ~:max-tokens~ values through ~llm-make-chat-prompt~, so
+  providers can cap generated responses using their standard token limit
+  support.  The model transient menu can set or reset the max-token override,
+  and documentation and tests cover the new behavior.
+
 * Version 1.16.3
 - Sort saved session candidates newest first in ~ellama-load-session~.  Ellama
   now orders session files by modification time and exposes standard completion
diff --git a/README.org b/README.org
index fe12856..8f01f30 100644
--- a/README.org
+++ b/README.org
@@ -174,9 +174,10 @@ More sophisticated configuration example:
 - ~ellama-provider-select~: Select ellama provider.
 - ~ellama-select-model~: Change the current provider model interactively.  The
   model transient supports Ollama and OpenAI-compatible providers, including URL
-  editing for compatible APIs.  Use "Reset model fields" to clear model,
-  temperature, and context-length overrides and let the provider use its
-  defaults; reset values are shown as ~default~ in the transient.
+  editing for compatible APIs.  It can also set the maximum number of output
+  tokens.  Use "Reset model fields" to clear model, temperature, context-length,
+  and max-token overrides and let the provider use its defaults; reset values
+  are shown as ~default~ in the transient.
 - ~ellama-code-complete~: Complete selected code or code in the current buffer
   according to a provided change using Ellama.
 - ~ellama-code-add~: Generate and insert new code based on description. This
@@ -316,6 +317,8 @@ language is english.
 There are many supported providers: ~ollama~, ~open ai~, ~vertex~,
 ~GPT4All~. For more information see
 [[https://elpa.gnu.org/packages/llm.html][llm documentation]].
+- ~ellama-max-tokens~: Maximum number of tokens to generate.  If not set, use
+  the provider default.
 - ~ellama-providers~: association list of model llm providers with name as key.
 - ~ellama-spinner-enabled~: Enable spinner during text generation.
 - ~ellama-spinner-type~: Spinner type for ellama. Default type is
diff --git a/ellama-transient.el b/ellama-transient.el
index 2815939..1e76468 100644
--- a/ellama-transient.el
+++ b/ellama-transient.el
@@ -51,6 +51,7 @@
 (defvar ellama-transient-url nil)
 (defvar ellama-transient-provider nil)
 (defvar ellama--current-session-uid)
+(defvar ellama-max-tokens)
 
 (declare-function ellama-ask-image "ellama"
                   (image prompt &optional create-session &rest args))
@@ -100,6 +101,22 @@ Otherwise, prompt the user to enter a system message."
   (interactive)
   (setq ellama-transient-context-length (read-number "Enter context length: ")))
 
+(transient-define-suffix ellama-transient-set-max-tokens ()
+  "Prompt for the maximum number of output tokens and store it.
+Empty input sets `ellama-max-tokens' to nil (the provider default).
+Signal a `user-error' unless the input is a positive integer."
+  (interactive)
+  (let* ((input (read-string
+                 "Enter max tokens (empty for default): "
+                 (when ellama-max-tokens
+                   (number-to-string ellama-max-tokens))))
+         (value (unless (string-empty-p input) (string-to-number input))))
+    ;; Digits-only input rules out signs, floats and trailing junk.
+    (when (and value (or (not (string-match-p "\\`[[:digit:]]+\\'" input))
+                         (<= value 0)))
+      (user-error "Max tokens must be a positive integer"))
+    (setq ellama-max-tokens value)))
+
 (transient-define-suffix ellama-transient-set-host ()
   "Set host address."
   (interactive)
@@ -120,7 +137,8 @@ Otherwise, prompt the user to enter a system message."
   (interactive)
   (setq ellama-transient-model-name nil
         ellama-transient-temperature nil
-        ellama-transient-context-length nil))
+        ellama-transient-context-length nil
+        ellama-max-tokens nil))
 
 (defvar ellama-provider-list '(ellama-provider
                                ellama-coding-provider
@@ -183,6 +201,9 @@ Otherwise, prompt the user to enter a system message."
                      ellama-transient-provider))
      :transient t
      :description ellama-transient-context-length-description)
+    ("x" "Set Max Tokens" ellama-transient-set-max-tokens
+     :transient t
+     :description ellama-transient-max-tokens-description)
     ("r" "Reset model fields" ellama-transient-reset-model-fields
      :transient t)
     ("S" "Set provider" ellama-transient-set-provider
@@ -243,6 +264,11 @@ FORMAT is used for non-default VALUE."
   (ellama-transient--field-description
    "Context Length" ellama-transient-context-length "%d"))
 
+(defun ellama-transient-max-tokens-description ()
+  "Return the transient label for the current `ellama-max-tokens' value."
+  (ellama-transient--field-description
+   "Max Tokens" ellama-max-tokens "%d"))
+
 (defun ellama-transient--ollama-provider-p (provider)
   "Return non-nil when PROVIDER is an Ollama provider."
   (declare-function llm-ollama-p "ext:llm-ollama")
diff --git a/ellama.el b/ellama.el
index f858d46..f33ed33 100644
--- a/ellama.el
+++ b/ellama.el
@@ -6,7 +6,7 @@
 ;; URL: http://github.com/s-kostyaev/ellama
 ;; Keywords: help local tools
 ;; Package-Requires: ((emacs "28.1") (llm "0.24.0") (plz "0.8") (transient "0.7") (compat "29.1") (yaml "1.2.3"))
-;; Version: 1.16.3
+;; Version: 1.17.0
 ;; SPDX-License-Identifier: GPL-3.0-or-later
 ;; Created: 8th Oct 2023
 
@@ -71,6 +71,12 @@
   "Backend LLM provider."
   :type '(sexp :validate llm-standard-provider-p))
 
+(defcustom ellama-max-tokens nil
+  "Maximum number of tokens to generate.
+When nil, use the provider default.  Per-request `:max-tokens' overrides it."
+  :type '(choice (const :tag "Use provider default" nil)
+                 (integer :tag "Maximum tokens")))
+
 (defcustom ellama-session-remove-reasoning t
   "Remove internal reasoning from the session after ellama provide an answer.
 This can improve long-term communication with reasoning models."
@@ -2712,6 +2718,8 @@ file by default.
 
 :images FILES -- attach image FILES to the prompt when provider supports it.
 
+:max-tokens INTEGER -- maximum number of tokens to generate.
+
 :on-error ON-ERROR -- ON-ERROR a function that's called with an error message on
 failure (with BUFFER current).
 
@@ -2733,8 +2741,11 @@ failure (with BUFFER current).
                      (or (plist-get args :provider)
                          ellama-provider
                          (ellama-get-first-ollama-chat-model))))
+         (max-tokens (or (plist-get args :max-tokens)
+                         ellama-max-tokens))
          (reasoning-buffer (get-buffer-create
-                            (concat (make-temp-name "*ellama-reasoning-") "*")))
+                            (concat (make-temp-name "*ellama-reasoning-")
+                                    "*")))
          (point (or (plist-get args :point)
                     (with-current-buffer buffer (point))))
          (replace-beg (plist-get args :replace-beg))
@@ -2770,16 +2781,25 @@ failure (with BUFFER current).
                                (llm-chat-prompt-append-response
                                 (ellama-session-prompt session)
                                 prompt-content)
-                               (setf (llm-chat-prompt-tools (ellama-session-prompt session))
-                                     tools)
-                               ;; System message is part of prompt context and should not be
-                               ;; appended on each interaction.
+                               (setf
+                                (llm-chat-prompt-tools
+                                 (ellama-session-prompt session))
+                                tools
+                                (llm-chat-prompt-max-tokens
+                                 (ellama-session-prompt session))
+                                max-tokens)
+                               ;; System message is part of prompt context and
+                               ;; should not be appended on each interaction.
                                (ellama-session-prompt session))
                            (setf (ellama-session-prompt session)
-                                 (llm-make-chat-prompt prompt-content :context system
-                                                       :tools tools)))
-                       (llm-make-chat-prompt prompt-content :context system
-                                             :tools tools))))
+                                 (llm-make-chat-prompt
+                                  prompt-content :context system
+                                  :tools tools
+                                  :max-tokens max-tokens)))
+                       (llm-make-chat-prompt
+                        prompt-content :context system
+                        :tools tools
+                        :max-tokens max-tokens))))
     (when (not (eq (null replace-beg) (null replace-end)))
       (error "Specify both :replace-beg and :replace-end"))
     (with-current-buffer reasoning-buffer
@@ -3182,6 +3202,8 @@ ARGS contains keys for fine control.
 
 :images FILES -- attach image FILES to the prompt when provider supports it.
 
+:max-tokens INTEGER -- maximum number of tokens to generate.
+
 :ephemeral BOOL -- create an ephemeral session if set.
 
 :on-done ON-DONE -- ON-DONE a function that's called with
@@ -3194,6 +3216,7 @@ the full response text when the request completes (with BUFFER current)."
          (variants (mapcar #'car providers))
          (system (plist-get args :system))
          (donecb (plist-get args :on-done))
+         (max-tokens (plist-get args :max-tokens))
          (images (ellama--normalize-image-files
                   (or (plist-get args :images)
                       (plist-get args :image))))
@@ -3294,6 +3317,7 @@ the full response text when the request completes (with BUFFER current)."
            :session session
            :system system
            :images images
+           :max-tokens max-tokens
            :on-done (if donecb
                         (list 'ellama-chat-done donecb)
                       'ellama-chat-done)
diff --git a/ellama.info b/ellama.info
index f174071..0e4007e 100644
--- a/ellama.info
+++ b/ellama.info
@@ -289,8 +289,9 @@ File: ellama.info,  Node: Commands,  Next: Keymap,  Prev: Installation,  Up: Top
    • ‘ellama-select-model’: Change the current provider model
      interactively.  The model transient supports Ollama and
      OpenAI-compatible providers, including URL editing for compatible
-     APIs.  Use "Reset model fields" to clear model, temperature, and
-     context-length overrides and let the provider use its defaults;
+     APIs.  It can also set the maximum number of output tokens.  Use
+     "Reset model fields" to clear model, temperature, context-length,
+     and max-token overrides and let the provider use its defaults;
      reset values are shown as ‘default’ in the transient.
    • ‘ellama-code-complete’: Complete selected code or code in the
      current buffer according to a provided change using Ellama.
@@ -449,6 +450,8 @@ language is english.
 There are many supported providers: ‘ollama’, ‘open ai’, ‘vertex’,
 ‘GPT4All’.  For more information see llm documentation
 (https://elpa.gnu.org/packages/llm.html).
+   • ‘ellama-max-tokens’: Maximum number of tokens to generate.  If not
+     set, use the provider default.
    • ‘ellama-providers’: association list of model llm providers with
      name as key.
    • ‘ellama-spinner-enabled’: Enable spinner during text generation.
@@ -2343,44 +2346,44 @@ Tag Table:
 Node: Top1379
 Node: Installation3893
 Node: Commands8907
-Node: Keymap17466
-Node: Configuration20353
-Node: Session Compaction31766
-Node: Image Input33390
-Node: Task Tool Subagents35535
-Node: DLP for Tool Input/Output37638
-Node: SRT Filesystem Policy for Tools52540
-Node: Context Management58209
-Node: Transient Menus for Context Management59277
-Node: Managing the Context60956
-Node: Considerations61731
-Node: Minor modes62324
-Node: ellama-context-header-line-mode64312
-Node: ellama-context-header-line-global-mode65137
-Node: ellama-context-mode-line-mode65857
-Node: ellama-context-mode-line-global-mode66705
-Node: Ellama Session Header Line Mode67409
-Node: Enabling and Disabling67978
-Node: Customization68425
-Node: Ellama Session Mode Line Mode68713
-Node: Enabling and Disabling (1)69298
-Node: Customization (1)69745
-Node: Using Blueprints70039
-Node: Key Components of Ellama Blueprints70679
-Node: Creating and Managing Blueprints71286
-Node: Blueprints files72264
-Node: Variable Management72685
-Node: Keymap and Mode73138
-Node: Transient Menus74074
-Node: Running Blueprints programmatically74620
-Node: MCP Integration75207
-Node: Agent Skills76442
-Node: Directory Structure76805
-Node: Creating a Skill77832
-Node: How it works78207
-Node: Acknowledgments78598
-Node: Contributions79309
-Node: GNU Free Documentation License79695
+Node: Keymap17536
+Node: Configuration20423
+Node: Session Compaction31950
+Node: Image Input33574
+Node: Task Tool Subagents35719
+Node: DLP for Tool Input/Output37822
+Node: SRT Filesystem Policy for Tools52724
+Node: Context Management58393
+Node: Transient Menus for Context Management59461
+Node: Managing the Context61140
+Node: Considerations61915
+Node: Minor modes62508
+Node: ellama-context-header-line-mode64496
+Node: ellama-context-header-line-global-mode65321
+Node: ellama-context-mode-line-mode66041
+Node: ellama-context-mode-line-global-mode66889
+Node: Ellama Session Header Line Mode67593
+Node: Enabling and Disabling68162
+Node: Customization68609
+Node: Ellama Session Mode Line Mode68897
+Node: Enabling and Disabling (1)69482
+Node: Customization (1)69929
+Node: Using Blueprints70223
+Node: Key Components of Ellama Blueprints70863
+Node: Creating and Managing Blueprints71470
+Node: Blueprints files72448
+Node: Variable Management72869
+Node: Keymap and Mode73322
+Node: Transient Menus74258
+Node: Running Blueprints programmatically74804
+Node: MCP Integration75391
+Node: Agent Skills76626
+Node: Directory Structure76989
+Node: Creating a Skill78016
+Node: How it works78391
+Node: Acknowledgments78782
+Node: Contributions79493
+Node: GNU Free Documentation License79879
 
 End Tag Table
 
diff --git a/tests/test-ellama-transient.el b/tests/test-ellama-transient.el
index 96eb88a..81950a6 100644
--- a/tests/test-ellama-transient.el
+++ b/tests/test-ellama-transient.el
@@ -99,17 +99,36 @@
 (ert-deftest test-ellama-transient-reset-model-fields-and-descriptions ()
   (let ((ellama-transient-model-name "model")
         (ellama-transient-temperature 0.4)
-        (ellama-transient-context-length 8192))
+        (ellama-transient-context-length 8192)
+        (ellama-max-tokens 256))
     (ellama-transient-reset-model-fields)
     (should-not ellama-transient-model-name)
     (should-not ellama-transient-temperature)
     (should-not ellama-transient-context-length)
+    (should-not ellama-max-tokens)
     (should (equal (ellama-transient-model-description)
                    "Model (default)"))
     (should (equal (ellama-transient-temperature-description)
                    "Temperature (default)"))
     (should (equal (ellama-transient-context-length-description)
-                   "Context Length (default)"))))
+                   "Context Length (default)"))
+    (should (equal (ellama-transient-max-tokens-description)
+                   "Max Tokens (default)"))))
+
+(ert-deftest test-ellama-transient-set-max-tokens ()
+  (let ((ellama-max-tokens nil))
+    (cl-letf (((symbol-function 'read-string) (lambda (&rest _args) "42")))
+      (ellama-transient-set-max-tokens)
+      (should (= ellama-max-tokens 42))
+      (should (equal (ellama-transient-max-tokens-description)
+                     "Max Tokens (42)")))
+    (dolist (bad '("0" "-1" "1.5" "abc")) ; rejected input keeps the old value
+      (cl-letf (((symbol-function 'read-string) (lambda (&rest _args) bad)))
+        (should-error (ellama-transient-set-max-tokens))))
+    (should (= ellama-max-tokens 42))
+    (cl-letf (((symbol-function 'read-string) (lambda (&rest _args) "")))
+      (ellama-transient-set-max-tokens)
+      (should-not ellama-max-tokens))))
 
 (ert-deftest test-ellama-transient-set-model-keeps-reset-temperature ()
   (let ((ellama-transient-provider :provider)
diff --git a/tests/test-ellama.el b/tests/test-ellama.el
index e6f9c72..583ec90 100644
--- a/tests/test-ellama.el
+++ b/tests/test-ellama.el
@@ -831,6 +831,48 @@ detailed comparison to help you decide:
         (when buf
           (kill-buffer buf))))))
 
+(ert-deftest test-ellama-stream-uses-max-tokens ()
+  (let* ((provider (make-llm-fake))
+         (ellama-provider provider)
+         (ellama-max-tokens 7)          ; global default under test
+         (ellama-response-process-method 'streaming)
+         (ellama-spinner-enabled nil)
+         (ellama-fill-paragraphs nil)
+         captured-prompt
+         done-text)
+    (cl-letf (((symbol-function 'llm-chat-streaming)
+               (lambda (_provider prompt _partial-callback response-callback
+                                  _error-callback &optional _multi-output)
+                 (setq captured-prompt prompt) ; keep the prompt for inspection
+                 (funcall response-callback '(:text "ok"))
+                 nil)))
+      (with-temp-buffer
+        (ellama-stream "test prompt"
+                       :provider provider
+                       :on-done (lambda (text) (setq done-text text)))))
+    (should (equal done-text "ok"))
+    (should (= (llm-chat-prompt-max-tokens captured-prompt) 7))))
+
+(ert-deftest test-ellama-stream-max-tokens-argument-overrides-default ()
+  (let* ((provider (make-llm-fake))
+         (ellama-provider provider)
+         (ellama-max-tokens 7)          ; default that :max-tokens must beat
+         (ellama-response-process-method 'streaming)
+         (ellama-spinner-enabled nil)
+         (ellama-fill-paragraphs nil)
+         captured-prompt)
+    (cl-letf (((symbol-function 'llm-chat-streaming)
+               (lambda (_provider prompt _partial-callback response-callback
+                                  _error-callback &optional _multi-output)
+                 (setq captured-prompt prompt) ; keep the prompt for inspection
+                 (funcall response-callback '(:text "ok"))
+                 nil)))
+      (with-temp-buffer
+        (ellama-stream "test prompt"
+                       :provider provider
+                       :max-tokens 2)))
+    (should (= (llm-chat-prompt-max-tokens captured-prompt) 2))))
+
 (ert-deftest test-ellama-stream-defaults-to-current-buffer-with-active-session ()
   (let* ((ellama-provider
           (make-llm-fake