CoCalc -- react_agent

GitHub Repository: kardolus/chatgpt-cli
Path: blob/main/agent/react/react_agent_test.go
3433 views
1
package react_test
2

3
import (
4
	"context"
5
	"errors"
6
	"fmt"
7
	"github.com/kardolus/chatgpt-cli/agent/core"
8
	"github.com/kardolus/chatgpt-cli/agent/react"
9
	"github.com/kardolus/chatgpt-cli/agent/types"
10
	"testing"
11
	"time"
12

13
	"github.com/golang/mock/gomock"
14
	. "github.com/onsi/gomega"
15
	"github.com/sclevine/spec"
16
	"github.com/sclevine/spec/report"
17
)
18

19
//go:generate mockgen -destination=runnermocks_test.go -package=react_test github.com/kardolus/chatgpt-cli/agent/core Runner
20
//go:generate mockgen -destination=clockmocks_test.go -package=react_test github.com/kardolus/chatgpt-cli/agent/core Clock
21
//go:generate mockgen -destination=llmmocks_test.go -package=react_test github.com/kardolus/chatgpt-cli/agent/tools LLM
22
//go:generate mockgen -destination=budgetmocks_test.go -package=react_test github.com/kardolus/chatgpt-cli/agent/core Budget
23

24
func TestUnitReAct(t *testing.T) {
25
	spec.Run(t, "Testing ReActAgent", testReActAgent, spec.Report(report.Terminal{}))
26
}
27

28
func testReActAgent(t *testing.T, when spec.G, it spec.S) {
29
	var (
30
		ctrl   *gomock.Controller
31
		llm    *MockLLM
32
		runner *MockRunner
33
		budget *MockBudget
34
		clock  *MockClock
35

36
		reactAgent *react.ReActAgent
37
		ctx        context.Context
38
		now        time.Time
39
	)
40

41
	it.Before(func() {
42
		RegisterTestingT(t)
43

44
		ctrl = gomock.NewController(t)
45
		llm = NewMockLLM(ctrl)
46
		runner = NewMockRunner(ctrl)
47
		budget = NewMockBudget(ctrl)
48
		clock = NewMockClock(ctrl)
49

50
		reactAgent = react.NewReActAgent(llm, runner, budget, clock)
51
		ctx = context.Background()
52
		now = time.Date(2026, 1, 15, 10, 0, 0, 0, time.UTC)
53

54
		clock.EXPECT().Now().Return(now).AnyTimes()
55
	})
56

57
	it.After(func() {
58
		ctrl.Finish()
59
	})
60

61
	when("LLM returns final answer immediately", func() {
62
		it("returns the answer without tool calls", func() {
63

64
			budget.EXPECT().AllowIteration(now).Return(nil)             // NEW
65
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{}) // NEW (no token limit)
66
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
67

68
			llm.EXPECT().
69
				Complete(gomock.Any(), gomock.Any()).
70
				Return(`{
71
					"thought": "The answer is simple",
72
					"action_type": "answer",
73
					"final_answer": "42"
74
				}`, 10, nil)
75

76
			budget.EXPECT().ChargeLLMTokens(10, now)
77

78
			_, err := reactAgent.RunAgentGoal(ctx, "What is the answer?")
79
			Expect(err).NotTo(HaveOccurred())
80
		})
81
	})
82

83
	when("LLM uses a shell tool then answers", func() {
84
		it("executes the tool and returns the final answer", func() {
85

86
			// Iteration 1: tool
87
			budget.EXPECT().AllowIteration(now).Return(nil)
88
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
89
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
90

91
			llm.EXPECT().
92
				Complete(gomock.Any(), gomock.Any()).
93
				Return(`{
94
					"thought": "I need to list files",
95
					"action_type": "tool",
96
					"tool": "shell",
97
					"command": "ls",
98
					"args": ["-la"]
99
				}`, 15, nil)
100

101
			budget.EXPECT().ChargeLLMTokens(15, now)
102

103
			runner.EXPECT().
104
				RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
105
				DoAndReturn(func(_ context.Context, _ types.Config, step types.Step) (types.StepResult, error) {
106
					Expect(step.Type).To(Equal(types.ToolShell))
107
					Expect(step.Command).To(Equal("ls"))
108
					Expect(step.Args).To(Equal([]string{"-la"}))
109
					return types.StepResult{
110
						Outcome:  types.OutcomeOK,
111
						Output:   "file1.txt\nfile2.txt",
112
						Duration: 100 * time.Millisecond,
113
					}, nil
114
				})
115

116
			// Iteration 2: answer
117
			budget.EXPECT().AllowIteration(now).Return(nil)
118
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
119
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
120

121
			llm.EXPECT().
122
				Complete(gomock.Any(), gomock.Any()).
123
				DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
124
					Expect(prompt).To(ContainSubstring("OBSERVATION: file1.txt"))
125
					return `{
126
						"thought": "I have the file list",
127
						"action_type": "answer",
128
						"final_answer": "There are 2 files"
129
					}`, 12, nil
130
				})
131

132
			budget.EXPECT().ChargeLLMTokens(12, now)
133

134
			_, err := reactAgent.RunAgentGoal(ctx, "How many files?")
135
			Expect(err).NotTo(HaveOccurred())
136
		})
137
	})
138

139
	when("Budget is exceeded", func() {
140
		it("returns Budget error", func() {
141

142
			budget.EXPECT().AllowIteration(now).Return(nil) // allow iteration
143
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
144
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(core.BudgetExceededError{
145
				Kind:    core.BudgetKindLLM,
146
				Limit:   5,
147
				Used:    5,
148
				Message: "LLM call Budget exceeded",
149
			})
150

151
			_, err := reactAgent.RunAgentGoal(ctx, "Do something")
152
			Expect(err).To(HaveOccurred())
153
			Expect(err.Error()).To(ContainSubstring("LLM call Budget exceeded"))
154
		})
155
	})
156

157
	when("LLM returns invalid JSON", func() {
158
		it("recovers in-band and returns a final answer", func() {
159

160
			// Iteration 1: invalid output
161
			budget.EXPECT().AllowIteration(now).Return(nil)
162
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
163
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
164

165
			llm.EXPECT().
166
				Complete(gomock.Any(), gomock.Any()).
167
				Return("not valid json", 5, nil)
168

169
			budget.EXPECT().ChargeLLMTokens(5, now)
170

171
			// Iteration 2: recovery prompt, model now returns valid answer JSON
172
			budget.EXPECT().AllowIteration(now).Return(nil)
173
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
174
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
175

176
			llm.EXPECT().
177
				Complete(gomock.Any(), gomock.Any()).
178
				DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
179
					Expect(prompt).To(ContainSubstring("ACTION_TAKEN: tool=LLM details=INVALID_RESPONSE"))
180
					Expect(prompt).To(ContainSubstring("OBSERVATION: ERROR: Your last response violated the ReAct protocol"))
181
					Expect(prompt).To(ContainSubstring("failed to locate JSON object"))
182
					Expect(prompt).To(ContainSubstring(`Raw response (truncated): "not valid json"`))
183
					return `{
184
					"thought": "ok",
185
					"action_type": "answer",
186
					"final_answer": "recovered"
187
				}`, 7, nil
188
				})
189

190
			budget.EXPECT().ChargeLLMTokens(7, now)
191

192
			res, err := reactAgent.RunAgentGoal(ctx, "Do something")
193
			Expect(err).NotTo(HaveOccurred())
194
			Expect(res).To(Equal("recovered"))
195
		})
196

197
		it("recovers in-band, then hard-fails after max parse recoveries", func() {
198

199
			// We expect 4 attempts total:
200
			// 1..3 => recovery continues
201
			// 4    => parseRecoveries becomes 4 (>3) => returns error
202
			for i := 0; i < 4; i++ {
203
				budget.EXPECT().AllowIteration(now).Return(nil)
204
				budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
205
				budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
206

207
				if i == 0 {
208
					llm.EXPECT().
209
						Complete(gomock.Any(), gomock.Any()).
210
						Return("not valid json", 5, nil)
211
				} else {
212
					// From attempt 2 onward, prompt should include the recovery observations.
213
					llm.EXPECT().
214
						Complete(gomock.Any(), gomock.Any()).
215
						DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
216
							Expect(prompt).To(ContainSubstring("ACTION_TAKEN: tool=LLM details=INVALID_RESPONSE"))
217
							Expect(prompt).To(ContainSubstring("OBSERVATION: ERROR: Your last response violated the ReAct protocol"))
218
							Expect(prompt).To(ContainSubstring(`"action_type"`))
219
							return "not valid json", 5, nil
220
						})
221
				}
222

223
				budget.EXPECT().ChargeLLMTokens(5, now)
224
			}
225

226
			_, err := reactAgent.RunAgentGoal(ctx, "Do something")
227
			Expect(err).To(HaveOccurred())
228
			Expect(err.Error()).To(ContainSubstring("agent failed to produce valid JSON after 3 attempts"))
229
			Expect(err.Error()).To(ContainSubstring("failed to locate JSON"))
230
		})
231
	})
232

233
	when("LLM returns JSON with missing action_type", func() {
234
		it("recovers in-band and returns a final answer", func() {
235

236
			// Iteration 1: parse succeeds, validation fails (missing action_type) -> triggers recovery path
237
			budget.EXPECT().AllowIteration(now).Return(nil)
238
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
239
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
240

241
			llm.EXPECT().
242
				Complete(gomock.Any(), gomock.Any()).
243
				Return(`{"thought":"thinking"}`, 5, nil)
244

245
			budget.EXPECT().ChargeLLMTokens(5, now)
246

247
			// Iteration 2: recovery prompt, model returns valid answer JSON
248
			budget.EXPECT().AllowIteration(now).Return(nil)
249
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
250
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
251

252
			llm.EXPECT().
253
				Complete(gomock.Any(), gomock.Any()).
254
				DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
255
					Expect(prompt).To(ContainSubstring("ACTION_TAKEN: tool=LLM details=INVALID_RESPONSE"))
256
					Expect(prompt).To(ContainSubstring("OBSERVATION: ERROR: Your last response violated the ReAct protocol"))
257
					Expect(prompt).To(ContainSubstring("missing action_type"))
258
					// raw is JSON, so the snippet should include it
259
					Expect(prompt).To(ContainSubstring(`Raw response (truncated): "{\"thought\":\"thinking\"}"`))
260
					return `{
261
					"thought": "ok",
262
					"action_type": "answer",
263
					"final_answer": "recovered"
264
				}`, 7, nil
265
				})
266

267
			budget.EXPECT().ChargeLLMTokens(7, now)
268

269
			res, err := reactAgent.RunAgentGoal(ctx, "Do something")
270
			Expect(err).NotTo(HaveOccurred())
271
			Expect(res).To(Equal("recovered"))
272
		})
273

274
		it("recovers in-band, then hard-fails after max parse recoveries", func() {
275

276
			for i := 0; i < 4; i++ {
277
				budget.EXPECT().AllowIteration(now).Return(nil)
278
				budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
279
				budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
280

281
				if i == 0 {
282
					llm.EXPECT().
283
						Complete(gomock.Any(), gomock.Any()).
284
						Return(`{"thought":"thinking"}`, 5, nil)
285
				} else {
286
					llm.EXPECT().
287
						Complete(gomock.Any(), gomock.Any()).
288
						DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
289
							Expect(prompt).To(ContainSubstring("ACTION_TAKEN: tool=LLM details=INVALID_RESPONSE"))
290
							Expect(prompt).To(ContainSubstring("OBSERVATION: ERROR: Your last response violated the ReAct protocol"))
291
							Expect(prompt).To(ContainSubstring("missing action_type"))
292
							return `{"thought":"thinking"}`, 5, nil
293
						})
294
				}
295

296
				budget.EXPECT().ChargeLLMTokens(5, now)
297
			}
298

299
			_, err := reactAgent.RunAgentGoal(ctx, "Do something")
300
			Expect(err).To(HaveOccurred())
301
			Expect(err.Error()).To(ContainSubstring("agent failed to produce valid JSON after 3 attempts"))
302
			Expect(err.Error()).To(ContainSubstring("missing action_type"))
303
		})
304
	})
305

306
	when("tool execution fails", func() {
307
		it("returns the execution error", func() {
308

309
			budget.EXPECT().AllowIteration(now).Return(nil)
310
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
311
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
312

313
			llm.EXPECT().
314
				Complete(gomock.Any(), gomock.Any()).
315
				Return(`{
316
					"thought": "running command",
317
					"action_type": "tool",
318
					"tool": "shell",
319
					"command": "false"
320
				}`, 10, nil)
321

322
			budget.EXPECT().ChargeLLMTokens(10, now)
323

324
			runner.EXPECT().
325
				RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
326
				Return(types.StepResult{
327
					Outcome:    types.OutcomeError,
328
					Transcript: "command failed",
329
				}, errors.New("exit 1"))
330

331
			_, err := reactAgent.RunAgentGoal(ctx, "Run false")
332
			Expect(err).To(HaveOccurred())
333
			Expect(err.Error()).To(ContainSubstring("exit 1"))
334
		})
335
	})
336

337
	when("iteration Budget is exceeded", func() {
338
		it("returns iteration Budget exceeded error", func() {
339

340
			// 10 successful iterations, then fail on 11th AllowIteration
341
			for i := 0; i < 10; i++ {
342
				budget.EXPECT().AllowIteration(now).Return(nil)
343
				budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
344
				budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
345

346
				llm.EXPECT().
347
					Complete(gomock.Any(), gomock.Any()).
348
					Return(fmt.Sprintf(`{
349
					"thought": "still working",
350
					"action_type": "tool",
351
					"tool": "shell",
352
					"command": "echo",
353
					"args": ["test-%d"]
354
				}`, i), 10, nil)
355

356
				budget.EXPECT().ChargeLLMTokens(10, now)
357
			}
358

359
			// We expect 10 tool executions total.
360
			runner.EXPECT().
361
				RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
362
				Times(10).
363
				Return(types.StepResult{
364
					Outcome:  types.OutcomeOK,
365
					Output:   "ok",
366
					Duration: 10 * time.Millisecond,
367
				}, nil)
368

369
			budget.EXPECT().AllowIteration(now).Return(core.BudgetExceededError{
370
				Kind:    core.BudgetKindIterations,
371
				Limit:   10,
372
				Used:    10,
373
				Message: "iteration Budget exceeded",
374
			})
375

376
			_, err := reactAgent.RunAgentGoal(ctx, "Keep looping")
377
			Expect(err).To(HaveOccurred())
378
			Expect(err.Error()).To(ContainSubstring("iteration Budget exceeded"))
379
		})
380
	})
381

382
	when("LLM output has markdown code fences", func() {
383
		it("strips the fences and parses correctly", func() {
384

385
			budget.EXPECT().AllowIteration(now).Return(nil)
386
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
387
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
388

389
			llm.EXPECT().
390
				Complete(gomock.Any(), gomock.Any()).
391
				Return("```json\n{\"thought\": \"done\", \"action_type\": \"answer\", \"final_answer\": \"Success\"}\n```", 10, nil)
392

393
			budget.EXPECT().ChargeLLMTokens(10, now)
394

395
			_, err := reactAgent.RunAgentGoal(ctx, "Test markdown")
396
			Expect(err).NotTo(HaveOccurred())
397
		})
398

399
		when("shell tool missing command", func() {
400
			it("injects error observation and lets LLM recover", func() {
401

402
				// Iteration 1: invalid shell tool (missing command)
403
				budget.EXPECT().AllowIteration(now).Return(nil)
404
				budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
405
				budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
406

407
				llm.EXPECT().
408
					Complete(gomock.Any(), gomock.Any()).
409
					Return(`{
410
				"thought": "using shell",
411
				"action_type": "tool",
412
				"tool": "shell",
413
				"command": ""
414
			}`, 10, nil)
415

416
				budget.EXPECT().ChargeLLMTokens(10, now)
417

418
				// Iteration 2: agent should surface validation error to the model as an observation,
419
				// then model answers (or chooses a different tool).
420
				budget.EXPECT().AllowIteration(now).Return(nil)
421
				budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
422
				budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
423

424
				llm.EXPECT().
425
					Complete(gomock.Any(), gomock.Any()).
426
					DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
427
						Expect(prompt).To(ContainSubstring("OBSERVATION: ERROR"))
428
						Expect(prompt).To(ContainSubstring("shell tool requires command"))
429
						return `{
430
					"thought": "ok",
431
					"action_type": "answer",
432
					"final_answer": "cannot run shell without a command"
433
				}`, 1, nil
434
					})
435

436
				budget.EXPECT().ChargeLLMTokens(1, now)
437

438
				res, err := reactAgent.RunAgentGoal(ctx, "Test")
439
				Expect(err).NotTo(HaveOccurred())
440
				Expect(res).To(Equal("cannot run shell without a command"))
441
			})
442
		})
443
	})
444

445
	when("LLM uses shorthand action_type like file/shell/LLM", func() {
446
		it("treats action_type=file as a tool call", func() {
447

448
			budget.EXPECT().AllowIteration(now).Return(nil)
449
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
450
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
451

452
			// NOTE: action_type is "file" (shorthand). tool is omitted.
453
			llm.EXPECT().
454
				Complete(gomock.Any(), gomock.Any()).
455
				Return(`{
456
				"thought": "read it",
457
				"action_type": "file",
458
				"op": "read",
459
				"path": "AGENTS.md"
460
			}`, 10, nil)
461

462
			budget.EXPECT().ChargeLLMTokens(10, now)
463

464
			runner.EXPECT().
465
				RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
466
				DoAndReturn(func(_ context.Context, _ types.Config, step types.Step) (types.StepResult, error) {
467
					Expect(step.Type).To(Equal(types.ToolFiles))
468
					Expect(step.Op).To(Equal("read"))
469
					Expect(step.Path).To(Equal("AGENTS.md"))
470
					return types.StepResult{
471
						Outcome:  types.OutcomeOK,
472
						Output:   "ok",
473
						Duration: 1 * time.Millisecond,
474
					}, nil
475
				})
476

477
			// Next iteration: answer
478
			budget.EXPECT().AllowIteration(now).Return(nil)
479
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
480
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
481

482
			llm.EXPECT().
483
				Complete(gomock.Any(), gomock.Any()).
484
				Return(`{
485
				"thought": "done",
486
				"action_type": "answer",
487
				"final_answer": "ok"
488
			}`, 5, nil)
489

490
			budget.EXPECT().ChargeLLMTokens(5, now)
491

492
			_, err := reactAgent.RunAgentGoal(ctx, "Read AGENTS")
493
			Expect(err).NotTo(HaveOccurred())
494
		})
495
	})
496

497
	when("LLM returns multiple JSON objects back-to-back", func() {
498
		it("parses only the first JSON object", func() {
499

500
			budget.EXPECT().AllowIteration(now).Return(nil)
501
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
502
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
503

504
			// Two JSON objects concatenated. parseReActResponse should take only the first.
505
			llm.EXPECT().
506
				Complete(gomock.Any(), gomock.Any()).
507
				Return(
508
					`{"thought":"one","action_type":"answer","final_answer":"A"}{"thought":"two","action_type":"answer","final_answer":"B"}`,
509
					10,
510
					nil,
511
				)
512

513
			budget.EXPECT().ChargeLLMTokens(10, now)
514

515
			res, err := reactAgent.RunAgentGoal(ctx, "Test")
516
			Expect(err).NotTo(HaveOccurred())
517
			Expect(res).To(Equal("A"))
518
		})
519
	})
520

521
	when("LLM repeats the same tool call twice in a row", func() {
522
		it("injects a repetition observation and forces a different next step", func() {
523

524
			// Iteration 1: tool
525
			budget.EXPECT().AllowIteration(now).Return(nil)
526
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
527
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
528

529
			llm.EXPECT().
530
				Complete(gomock.Any(), gomock.Any()).
531
				Return(`{
532
				"thought": "do it",
533
				"action_type": "tool",
534
				"tool": "shell",
535
				"command": "ls",
536
				"args": ["-la"]
537
			}`, 10, nil)
538

539
			budget.EXPECT().ChargeLLMTokens(10, now)
540

541
			// Only ONE tool execution should happen (iteration 1).
542
			runner.EXPECT().
543
				RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
544
				Times(1).
545
				Return(types.StepResult{
546
					Outcome:  types.OutcomeOK,
547
					Output:   "file1\nfile2\n",
548
					Duration: 1 * time.Millisecond,
549
				}, nil)
550

551
			// Iteration 2: same tool again (should be blocked by repetition guard)
552
			budget.EXPECT().AllowIteration(now).Return(nil)
553
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
554
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
555

556
			llm.EXPECT().
557
				Complete(gomock.Any(), gomock.Any()).
558
				Return(`{
559
				"thought": "try again",
560
				"action_type": "tool",
561
				"tool": "shell",
562
				"command": "ls",
563
				"args": ["-la"]
564
			}`, 10, nil)
565

566
			budget.EXPECT().ChargeLLMTokens(10, now)
567

568
			// Iteration 3: must see injected repetition message in prompt, then answer
569
			budget.EXPECT().AllowIteration(now).Return(nil)
570
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
571
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
572

573
			llm.EXPECT().
574
				Complete(gomock.Any(), gomock.Any()).
575
				DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
576
					Expect(prompt).To(ContainSubstring("OBSERVATION: You are repeating the same tool call"))
577
					return `{
578
					"thought": "ok, I'll stop repeating",
579
					"action_type": "answer",
580
					"final_answer": "done"
581
				}`, 5, nil
582
				})
583

584
			budget.EXPECT().ChargeLLMTokens(5, now)
585

586
			res, err := reactAgent.RunAgentGoal(ctx, "List files")
587
			Expect(err).NotTo(HaveOccurred())
588
			Expect(res).To(Equal("done"))
589
		})
590
	})
591

592
	when("LLM ignores repetition warnings", func() {
593
		it("hard-stops after too many repeats in the rolling window", func() {
594

595
			// We'll do 6 iterations total (1 executes, 2-5 skipped, 6 hard-stops)
596
			for i := 0; i < 6; i++ {
597
				budget.EXPECT().AllowIteration(now).Return(nil)
598
				budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
599
				budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
600

601
				llm.EXPECT().
602
					Complete(gomock.Any(), gomock.Any()).
603
					Return(`{
604
          "thought": "list files again",
605
          "action_type": "tool",
606
          "tool": "shell",
607
          "command": "ls",
608
          "args": ["-la"]
609
        }`, 1, nil)
610

611
				budget.EXPECT().ChargeLLMTokens(1, now)
612
			}
613

614
			// Only the FIRST iteration should actually execute the tool.
615
			runner.EXPECT().
616
				RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
617
				Times(1).
618
				DoAndReturn(func(_ context.Context, _ types.Config, step types.Step) (types.StepResult, error) {
619
					Expect(step.Type).To(Equal(types.ToolShell))
620
					Expect(step.Command).To(Equal("ls"))
621
					Expect(step.Args).To(Equal([]string{"-la"}))
622
					return types.StepResult{
623
						Outcome:  types.OutcomeOK,
624
						Output:   "file1\nfile2\n",
625
						Duration: 10 * time.Millisecond,
626
					}, nil
627
				})
628

629
			_, err := reactAgent.RunAgentGoal(ctx, "Loop forever")
630
			Expect(err).To(HaveOccurred())
631
			Expect(err.Error()).To(ContainSubstring("agent appears stuck"))
632
			Expect(err.Error()).To(ContainSubstring("repeated tool call too many times"))
633
		})
634
	})
635

636
	when("LLM uses shorthand action_type=file (no tool field)", func() {
637
		it("treats it as a tool call and executes file op", func() {
638

639
			// Iteration 1: shorthand file tool
640
			budget.EXPECT().AllowIteration(now).Return(nil)
641
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
642
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
643

644
			llm.EXPECT().
645
				Complete(gomock.Any(), gomock.Any()).
646
				Return(`{
647
				"thought": "read README",
648
				"action_type": "file",
649
				"op": "read",
650
				"path": "README.md"
651
			}`, 10, nil)
652

653
			budget.EXPECT().ChargeLLMTokens(10, now)
654

655
			runner.EXPECT().
656
				RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
657
				DoAndReturn(func(_ context.Context, _ types.Config, step types.Step) (types.StepResult, error) {
658
					Expect(step.Type).To(Equal(types.ToolFiles))
659
					Expect(step.Op).To(Equal("read"))
660
					Expect(step.Path).To(Equal("README.md"))
661
					return types.StepResult{
662
						Outcome:  types.OutcomeOK,
663
						Output:   "README CONTENT",
664
						Duration: 5 * time.Millisecond,
665
					}, nil
666
				})
667

668
			// Iteration 2: answer
669
			budget.EXPECT().AllowIteration(now).Return(nil)
670
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
671
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
672

673
			llm.EXPECT().
674
				Complete(gomock.Any(), gomock.Any()).
675
				DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
676
					Expect(prompt).To(ContainSubstring("OBSERVATION: README CONTENT"))
677
					return `{
678
					"thought": "done",
679
					"action_type": "answer",
680
					"final_answer": "ok"
681
				}`, 1, nil
682
				})
683

684
			budget.EXPECT().ChargeLLMTokens(1, now)
685

686
			_, err := reactAgent.RunAgentGoal(ctx, "Read README and answer")
687
			Expect(err).NotTo(HaveOccurred())
688
		})
689
	})
690

691
	when("LLM uses shorthand action_type=file AND also sets tool=file", func() {
692
		it("still treats it as a tool call (compat mode)", func() {
693

694
			// Iteration 1: shorthand but with tool field present
695
			budget.EXPECT().AllowIteration(now).Return(nil)
696
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
697
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
698

699
			llm.EXPECT().
700
				Complete(gomock.Any(), gomock.Any()).
701
				Return(`{
702
				"thought": "read AGENTS",
703
				"action_type": "file",
704
				"tool": "file",
705
				"op": "read",
706
				"path": "AGENTS.md"
707
			}`, 10, nil)
708

709
			budget.EXPECT().ChargeLLMTokens(10, now)
710

711
			runner.EXPECT().
712
				RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
713
				Return(types.StepResult{
714
					Outcome:  types.OutcomeOK,
715
					Output:   "AGENTS CONTENT",
716
					Duration: 5 * time.Millisecond,
717
				}, nil)
718

719
			// Iteration 2: answer
720
			budget.EXPECT().AllowIteration(now).Return(nil)
721
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
722
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
723

724
			llm.EXPECT().
725
				Complete(gomock.Any(), gomock.Any()).
726
				Return(`{
727
				"thought": "done",
728
				"action_type": "answer",
729
				"final_answer": "ok"
730
			}`, 1, nil)
731

732
			budget.EXPECT().ChargeLLMTokens(1, now)
733

734
			_, err := reactAgent.RunAgentGoal(ctx, "Read AGENTS and answer")
735
			Expect(err).NotTo(HaveOccurred())
736
		})
737
	})
738

739
	when("LLM uses shorthand action_type=file but tool mismatches", func() {
740
		it("recovers in-band rather than failing the whole run", func() {
741

742
			// Iteration 1: invalid action_type/tool combination -> parseReActResponse returns invalid action_type
743
			budget.EXPECT().AllowIteration(now).Return(nil)
744
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
745
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
746

747
			llm.EXPECT().
748
				Complete(gomock.Any(), gomock.Any()).
749
				Return(`{
750
				"thought": "oops",
751
				"action_type": "file",
752
				"tool": "shell",
753
				"command": "ls"
754
			}`, 10, nil)
755

756
			budget.EXPECT().ChargeLLMTokens(10, now)
757

758
			// Iteration 2: recovery prompt, model returns a valid final answer
759
			budget.EXPECT().AllowIteration(now).Return(nil)
760
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
761
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
762

763
			llm.EXPECT().
764
				Complete(gomock.Any(), gomock.Any()).
765
				DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
766
					Expect(prompt).To(ContainSubstring("ACTION_TAKEN: tool=LLM details=INVALID_RESPONSE"))
767
					Expect(prompt).To(ContainSubstring("OBSERVATION: ERROR: Your last response violated the ReAct protocol"))
768
					Expect(prompt).To(ContainSubstring(`invalid action_type: "file"`))
769
					Expect(prompt).To(ContainSubstring(`"tool": "shell"`))
770
					return `{
771
					"thought": "ack, correct schema",
772
					"action_type": "answer",
773
					"final_answer": "recovered"
774
				}`, 5, nil
775
				})
776

777
			budget.EXPECT().ChargeLLMTokens(5, now)
778

779
			res, err := reactAgent.RunAgentGoal(ctx, "Bad shorthand")
780
			Expect(err).NotTo(HaveOccurred())
781
			Expect(res).To(Equal("recovered"))
782
		})
783

784
		it("recovers in-band, then hard-fails after max parse recoveries", func() {
785

786
			// This is a validation error inside parseReActResponse: invalid action_type: "file"
787
			bad := `{
788
				"thought": "oops",
789
				"action_type": "file",
790
				"tool": "shell",
791
				"command": "ls"
792
			}`
793

794
			for i := 0; i < 4; i++ {
795
				budget.EXPECT().AllowIteration(now).Return(nil)
796
				budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
797
				budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
798

799
				if i == 0 {
800
					llm.EXPECT().
801
						Complete(gomock.Any(), gomock.Any()).
802
						Return(bad, 10, nil)
803
				} else {
804
					llm.EXPECT().
805
						Complete(gomock.Any(), gomock.Any()).
806
						DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
807
							Expect(prompt).To(ContainSubstring("ACTION_TAKEN: tool=LLM details=INVALID_RESPONSE"))
808
							Expect(prompt).To(ContainSubstring("invalid action_type"))
809
							Expect(prompt).To(ContainSubstring(`Raw response (truncated)`))
810
							return bad, 10, nil
811
						})
812
				}
813

814
				budget.EXPECT().ChargeLLMTokens(10, now)
815
			}
816

817
			_, err := reactAgent.RunAgentGoal(ctx, "Bad shorthand")
818
			Expect(err).To(HaveOccurred())
819
			Expect(err.Error()).To(ContainSubstring("agent failed to produce valid JSON after 3 attempts"))
820
			Expect(err.Error()).To(ContainSubstring(`invalid action_type: "file"`))
821
		})
822
	})
823

824
	when("LLM uses file patch", func() {
		it("converts to a ToolFiles step and executes it", func() {
			// Registers the three budget expectations that accompany one LLM turn.
			allowLLMTurn := func() {
				budget.EXPECT().AllowIteration(now).Return(nil)
				budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
				budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
			}

			// Scripted model replies; they are registered below in this order.
			patchReply := `{
      "thought":"apply diff",
      "action_type":"tool",
      "tool":"file",
      "op":"patch",
      "path":"a.txt",
      "data":"--- a/a.txt\n+++ b/a.txt\n@@\n+hi\n"
    }`
			answerReply := `{
      "thought":"done",
      "action_type":"answer",
      "final_answer":"ok"
    }`

			// Turn 1: the model requests a patch of a.txt.
			allowLLMTurn()
			llm.EXPECT().Complete(gomock.Any(), gomock.Any()).Return(patchReply, 1, nil)
			budget.EXPECT().ChargeLLMTokens(1, now)

			// The runner must see the request as a files-tool step carrying
			// the op, path and diff payload from the model's JSON.
			runner.EXPECT().RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
				DoAndReturn(func(_ context.Context, _ types.Config, got types.Step) (types.StepResult, error) {
					Expect(got.Type).To(Equal(types.ToolFiles))
					Expect(got.Op).To(Equal("patch"))
					Expect(got.Path).To(Equal("a.txt"))
					Expect(got.Data).To(ContainSubstring("+++ b/a.txt"))

					patched := types.StepResult{Outcome: types.OutcomeOK, Output: "patched"}
					return patched, nil
				})

			// Turn 2: the model finishes with an answer.
			allowLLMTurn()
			llm.EXPECT().Complete(gomock.Any(), gomock.Any()).Return(answerReply, 1, nil)
			budget.EXPECT().ChargeLLMTokens(1, now)

			_, runErr := reactAgent.RunAgentGoal(ctx, "Patch a.txt")
			Expect(runErr).NotTo(HaveOccurred())
		})
	})
865

866
	when("LLM uses file replace", func() {
		it("converts to a ToolFiles step with Old/New/N", func() {
			// Registers the three budget expectations that accompany one LLM turn.
			allowLLMTurn := func() {
				budget.EXPECT().AllowIteration(now).Return(nil)
				budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
				budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
			}

			// Scripted model replies; they are registered below in this order.
			replaceReply := `{
      "thought":"swap token",
      "action_type":"tool",
      "tool":"file",
      "op":"replace",
      "path":"a.txt",
      "old":"foo",
      "new":"bar",
      "n":2
    }`
			answerReply := `{
      "thought":"done",
      "action_type":"answer",
      "final_answer":"ok"
    }`

			// Turn 1: the model requests a replace of "foo" with "bar", n=2.
			allowLLMTurn()
			llm.EXPECT().Complete(gomock.Any(), gomock.Any()).Return(replaceReply, 1, nil)
			budget.EXPECT().ChargeLLMTokens(1, now)

			// Every replace field from the JSON must reach the runner's step.
			runner.EXPECT().RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
				DoAndReturn(func(_ context.Context, _ types.Config, got types.Step) (types.StepResult, error) {
					Expect(got.Type).To(Equal(types.ToolFiles))
					Expect(got.Op).To(Equal("replace"))
					Expect(got.Path).To(Equal("a.txt"))
					Expect(got.Old).To(Equal("foo"))
					Expect(got.New).To(Equal("bar"))
					Expect(got.N).To(Equal(2))

					replaced := types.StepResult{Outcome: types.OutcomeOK, Output: "replaced"}
					return replaced, nil
				})

			// Turn 2: the model finishes with an answer.
			allowLLMTurn()
			llm.EXPECT().Complete(gomock.Any(), gomock.Any()).Return(answerReply, 1, nil)
			budget.EXPECT().ChargeLLMTokens(1, now)

			_, runErr := reactAgent.RunAgentGoal(ctx, "Replace in a.txt")
			Expect(runErr).NotTo(HaveOccurred())
		})
	})
909

910
	when("LLM uses file patch without data", func() {
		it("injects error observation and lets LLM recover", func() {
			// Registers the three budget expectations that accompany one LLM turn.
			allowLLMTurn := func() {
				budget.EXPECT().AllowIteration(now).Return(nil)
				budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
				budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
			}

			// A patch request whose data is whitespace only.
			blankPatchReply := `{
      "thought":"patch",
      "action_type":"tool",
      "tool":"file",
      "op":"patch",
      "path":"a.txt",
      "data":"   "
    }`

			// Turn 1: the invalid patch. No RunStep expectation is registered
			// in this test, so the mock runner would reject any call.
			allowLLMTurn()
			llm.EXPECT().Complete(gomock.Any(), gomock.Any()).Return(blankPatchReply, 1, nil)
			budget.EXPECT().ChargeLLMTokens(1, now)

			// Turn 2: the follow-up prompt must carry the validation error as
			// an observation; the model then answers.
			allowLLMTurn()
			llm.EXPECT().Complete(gomock.Any(), gomock.Any()).
				DoAndReturn(func(_ context.Context, nextPrompt string) (string, int, error) {
					Expect(nextPrompt).To(ContainSubstring("file patch requires data"))
					Expect(nextPrompt).To(ContainSubstring("OBSERVATION: ERROR"))

					reply := `{
          "thought":"ok",
          "action_type":"answer",
          "final_answer":"fixed"
        }`
					return reply, 1, nil
				})
			budget.EXPECT().ChargeLLMTokens(1, now)

			answer, runErr := reactAgent.RunAgentGoal(ctx, "Patch")
			Expect(runErr).NotTo(HaveOccurred())
			Expect(answer).To(Equal("fixed"))
		})
	})
949

950
	when("LLM uses file replace without old", func() {
		it("injects error observation and lets LLM recover", func() {
			// Registers the three budget expectations that accompany one LLM turn.
			allowLLMTurn := func() {
				budget.EXPECT().AllowIteration(now).Return(nil)
				budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
				budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
			}

			// A replace request that omits "old" entirely.
			missingOldReply := `{
      "thought":"replace",
      "action_type":"tool",
      "tool":"file",
      "op":"replace",
      "path":"a.txt",
      "new":""
    }`

			// Turn 1: the invalid replace. No RunStep expectation is registered
			// in this test, so the mock runner would reject any call.
			allowLLMTurn()
			llm.EXPECT().Complete(gomock.Any(), gomock.Any()).Return(missingOldReply, 1, nil)
			budget.EXPECT().ChargeLLMTokens(1, now)

			// Turn 2: the follow-up prompt must report the missing field as an
			// error observation; the model then answers.
			allowLLMTurn()
			llm.EXPECT().Complete(gomock.Any(), gomock.Any()).
				DoAndReturn(func(_ context.Context, nextPrompt string) (string, int, error) {
					Expect(nextPrompt).To(ContainSubstring("file replace requires old"))
					Expect(nextPrompt).To(ContainSubstring("OBSERVATION: ERROR"))

					reply := `{
          "thought":"ok",
          "action_type":"answer",
          "final_answer":"recovered"
        }`
					return reply, 1, nil
				})
			budget.EXPECT().ChargeLLMTokens(1, now)

			answer, runErr := reactAgent.RunAgentGoal(ctx, "Replace")
			Expect(runErr).NotTo(HaveOccurred())
			Expect(answer).To(Equal("recovered"))
		})
	})
989

990
	when("patch fails and agent falls back to full write", func() {
		it("continues after patch failure observation and then writes", func() {
			// Registers the three budget expectations that accompany one LLM turn.
			allowLLMTurn := func() {
				budget.EXPECT().AllowIteration(now).Return(nil)
				budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
				budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
			}

			patchReply := `{
      "thought":"try patch first",
      "action_type":"tool",
      "tool":"file",
      "op":"patch",
      "path":"a.txt",
      "data":"--- a/a.txt\n+++ b/a.txt\n@@\n+hi\n"
    }`
			answerReply := `{
      "thought":"done",
      "action_type":"answer",
      "final_answer":"ok"
    }`

			// Turn 1: the model attempts a patch.
			allowLLMTurn()
			llm.EXPECT().Complete(gomock.Any(), gomock.Any()).Return(patchReply, 1, nil)
			budget.EXPECT().ChargeLLMTokens(1, now)

			// First runner call: the patch is reported as failed through the
			// result's Outcome/Output while the returned error stays nil.
			runner.EXPECT().RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
				DoAndReturn(func(_ context.Context, _ types.Config, got types.Step) (types.StepResult, error) {
					Expect(got.Op).To(Equal("patch"))

					failed := types.StepResult{
						Outcome:  types.OutcomeError,
						Output:   "patch failed: hunk did not apply",
						Duration: 1 * time.Millisecond,
					}
					return failed, nil
				})

			// Turn 2: the prompt must surface the failure; the model then
			// switches to writing the whole file.
			allowLLMTurn()
			llm.EXPECT().Complete(gomock.Any(), gomock.Any()).
				DoAndReturn(func(_ context.Context, nextPrompt string) (string, int, error) {
					Expect(nextPrompt).To(ContainSubstring("OBSERVATION: ERROR:"))
					Expect(nextPrompt).To(ContainSubstring("patch failed"))

					reply := `{
          "thought":"fallback to write full file",
          "action_type":"tool",
          "tool":"file",
          "op":"write",
          "path":"a.txt",
          "data":"FULL NEW CONTENT\n"
        }`
					return reply, 1, nil
				})
			budget.EXPECT().ChargeLLMTokens(1, now)

			// Second runner call: the full write, with its exact payload.
			runner.EXPECT().RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
				DoAndReturn(func(_ context.Context, _ types.Config, got types.Step) (types.StepResult, error) {
					Expect(got.Op).To(Equal("write"))
					Expect(got.Path).To(Equal("a.txt"))
					Expect(got.Data).To(Equal("FULL NEW CONTENT\n"))

					wrote := types.StepResult{Outcome: types.OutcomeOK, Output: "wrote"}
					return wrote, nil
				})

			// Turn 3: the model finishes with an answer.
			allowLLMTurn()
			llm.EXPECT().Complete(gomock.Any(), gomock.Any()).Return(answerReply, 1, nil)
			budget.EXPECT().ChargeLLMTokens(1, now)

			_, runErr := reactAgent.RunAgentGoal(ctx, "Modify a.txt")
			Expect(runErr).NotTo(HaveOccurred())
		})
	})
1062

1063
	when("a step produces side effects", func() {
		it("includes STATE line with cumulative effects in the next prompt", func() {
			// Registers the three budget expectations that accompany one LLM turn.
			allowLLMTurn := func() {
				budget.EXPECT().AllowIteration(now).Return(nil)
				budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
				budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
			}

			writeReply := `{
      "thought":"write a file",
      "action_type":"tool",
      "tool":"file",
      "op":"write",
      "path":"a.txt",
      "data":"hi"
    }`

			// The runner reports a successful write plus one recorded effect.
			writeResult := types.StepResult{
				Outcome:  types.OutcomeOK,
				Output:   "wrote",
				Duration: 1 * time.Millisecond,
				Effects: types.Effects{
					{Kind: "file.write", Path: "a.txt"},
				},
			}

			// Turn 1: the model writes a.txt.
			allowLLMTurn()
			llm.EXPECT().Complete(gomock.Any(), gomock.Any()).Return(writeReply, 1, nil)
			budget.EXPECT().ChargeLLMTokens(1, now)

			runner.EXPECT().RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
				Return(writeResult, nil)

			// Turn 2: the next prompt must mention the state summary and the
			// single file.write effect before the model answers.
			allowLLMTurn()
			llm.EXPECT().Complete(gomock.Any(), gomock.Any()).
				DoAndReturn(func(_ context.Context, nextPrompt string) (string, int, error) {
					Expect(nextPrompt).To(ContainSubstring("State:"))
					Expect(nextPrompt).To(ContainSubstring("file.write x1"))

					reply := `{
          "thought":"done",
          "action_type":"answer",
          "final_answer":"ok"
        }`
					return reply, 1, nil
				})
			budget.EXPECT().ChargeLLMTokens(1, now)

			_, runErr := reactAgent.RunAgentGoal(ctx, "Write")
			Expect(runErr).NotTo(HaveOccurred())
		})
	})
1111

1112
	when("patch fails and agent injects FALLBACK REQUIRED guidance", func() {
		it("includes fallback-required instruction (read+write) in the next prompt", func() {
			// Registers the three budget expectations that accompany one LLM turn.
			allowLLMTurn := func() {
				budget.EXPECT().AllowIteration(now).Return(nil)
				budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
				budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
			}

			patchReply := `{
      "thought":"try patch first",
      "action_type":"tool",
      "tool":"file",
      "op":"patch",
      "path":"a.txt",
      "data":"--- a/a.txt\n+++ b/a.txt\n@@ -1,1 +1,1 @@\n-old\n+new\n"
    }`

			// Turn 1: the model attempts a patch.
			allowLLMTurn()
			llm.EXPECT().Complete(gomock.Any(), gomock.Any()).Return(patchReply, 1, nil)
			budget.EXPECT().ChargeLLMTokens(1, now)

			// The runner signals the failure via OutcomeError with a nil error.
			runner.EXPECT().RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
				DoAndReturn(func(_ context.Context, _ types.Config, got types.Step) (types.StepResult, error) {
					Expect(got.Type).To(Equal(types.ToolFiles))
					Expect(got.Op).To(Equal("patch"))
					Expect(got.Path).To(Equal("a.txt"))

					failed := types.StepResult{
						Outcome:  types.OutcomeError,
						Output:   "invalid unified diff: missing hunk header",
						Duration: 1 * time.Millisecond,
					}
					return failed, nil
				})

			// Turn 2: the prompt content is the subject of this test.
			allowLLMTurn()
			llm.EXPECT().Complete(gomock.Any(), gomock.Any()).
				DoAndReturn(func(_ context.Context, nextPrompt string) (string, int, error) {
					// The ordinary error observation is still there.
					Expect(nextPrompt).To(ContainSubstring("OBSERVATION: ERROR:"))
					Expect(nextPrompt).To(ContainSubstring("invalid unified diff"))

					// The fallback guardrail text is present too.
					Expect(nextPrompt).To(ContainSubstring("OBSERVATION: FALLBACK REQUIRED"))
					Expect(nextPrompt).To(ContainSubstring(`Do NOT try op="patch" or op=patch/replace again`))

					// A ready-made JSON action for reading a.txt is included.
					Expect(nextPrompt).To(ContainSubstring(`{"action_type":"tool","tool":"file","op":"read","path":"a.txt"}`))

					// Answering right away is enough here; the fallback itself
					// is not executed in this unit test.
					reply := `{
          "thought":"ack",
          "action_type":"answer",
          "final_answer":"ok"
        }`
					return reply, 1, nil
				})
			budget.EXPECT().ChargeLLMTokens(1, now)

			_, runErr := reactAgent.RunAgentGoal(ctx, "Modify a.txt")
			Expect(runErr).NotTo(HaveOccurred())
		})
	})
1173

1174
	when("agent already has transcript + history from a previous run", func() {
1175
		it("resets them before starting a new run", func() {
1176
			// Seed previous run leftovers
1177
			reactAgent.AddTranscript("OLD_TRANSCRIPT_SHOULD_BE_CLEARED")
1178
			reactAgent.AddHistory("OLD_HISTORY_SHOULD_BE_CLEARED")
1179

1180
			budget.EXPECT().AllowIteration(now).Return(nil)
1181
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
1182
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
1183

1184
			llm.EXPECT().
1185
				Complete(gomock.Any(), gomock.Any()).
1186
				Return(`{
1187
					"thought": "ok",
1188
					"action_type": "answer",
1189
					"final_answer": "done"
1190
				}`, 3, nil)
1191

1192
			budget.EXPECT().ChargeLLMTokens(3, now)
1193

1194
			res, err := reactAgent.RunAgentGoal(ctx, "New goal")
1195
			Expect(err).NotTo(HaveOccurred())
1196
			Expect(res).To(Equal("done"))
1197

1198
			// Assert old data is gone
1199
			Expect(reactAgent.TranscriptString()).NotTo(ContainSubstring("OLD_TRANSCRIPT_SHOULD_BE_CLEARED"))
1200
			Expect(reactAgent.History()).NotTo(ContainSubstring("OLD_HISTORY_SHOULD_BE_CLEARED"))
1201

1202
			// And the new goal is present (sanity check)
1203
			Expect(reactAgent.TranscriptString()).To(ContainSubstring("[goal]"))
1204
			Expect(reactAgent.TranscriptString()).To(ContainSubstring("New goal"))
1205
			Expect(reactAgent.History()).To(ContainSubstring("USER: New goal"))
1206
		})
1207
	})
1208

1209
	when("prompt logging is enabled and transcript max is small", func() {
1210
		it("caps transcript length (and truncates) even if prompt is large", func() {
1211
			// Create an agent with a tiny transcript buffer so truncation is guaranteed.
1212
			// (We keep using the same mocks.)
1213
			agent := react.NewReActAgent(
1214
				llm, runner, budget, clock,
1215
				core.WithTranscriptMaxBytes(120),
1216
				core.WithPromptHistoryMaxBytes(120),
1217
			)
1218

1219
			budget.EXPECT().AllowIteration(now).Return(nil)
1220
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
1221
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
1222

1223
			llm.EXPECT().
1224
				Complete(gomock.Any(), gomock.Any()).
1225
				DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
1226
					// The prompt should be big (your buildReActPrompt() is huge),
1227
					// which is what makes prompt-logging risky if uncapped.
1228
					Expect(len(prompt)).To(BeNumerically(">", 200))
1229
					return `{
1230
						"thought": "ok",
1231
						"action_type": "answer",
1232
						"final_answer": "done"
1233
					}`, 2, nil
1234
				})
1235

1236
			budget.EXPECT().ChargeLLMTokens(2, now)
1237

1238
			_, err := agent.RunAgentGoal(ctx, "Goal that triggers large prompt")
1239
			Expect(err).NotTo(HaveOccurred())
1240

1241
			ts := agent.TranscriptString()
1242

1243
			// This is the actual regression check:
1244
			// transcript must be capped at <= 120 bytes.
1245
			Expect(len([]byte(ts))).To(BeNumerically("<=", 120))
1246

1247
			// And it should show your truncation banner if it overflowed.
1248
			// (banner text is "\n…(truncated)\n")
1249
			Expect(ts).To(ContainSubstring("…(truncated)"))
1250

1251
			// Optional: if you used the suggested prompt logging tag:
1252
			// Expect(ts).To(ContainSubstring("[iteration 1][prompt]"))
1253
		})
1254
	})
1255
}
1256

1257
Product

Resources

Company