Book a Demo!
CoCalc Logo Icon
Store Features Docs Share Support News About Policies Sign Up Sign In
kardolus
GitHub Repository: kardolus/chatgpt-cli
Path: blob/main/agent/react/react_agent_test.go
3433 views
1
package react_test
2
3
import (
4
"context"
5
"errors"
6
"fmt"
7
"github.com/kardolus/chatgpt-cli/agent/core"
8
"github.com/kardolus/chatgpt-cli/agent/react"
9
"github.com/kardolus/chatgpt-cli/agent/types"
10
"testing"
11
"time"
12
13
"github.com/golang/mock/gomock"
14
. "github.com/onsi/gomega"
15
"github.com/sclevine/spec"
16
"github.com/sclevine/spec/report"
17
)
18
19
//go:generate mockgen -destination=runnermocks_test.go -package=react_test github.com/kardolus/chatgpt-cli/agent/core Runner
20
//go:generate mockgen -destination=clockmocks_test.go -package=react_test github.com/kardolus/chatgpt-cli/agent/core Clock
21
//go:generate mockgen -destination=llmmocks_test.go -package=react_test github.com/kardolus/chatgpt-cli/agent/tools LLM
22
//go:generate mockgen -destination=budgetmocks_test.go -package=react_test github.com/kardolus/chatgpt-cli/agent/core Budget
23
24
func TestUnitReAct(t *testing.T) {
25
spec.Run(t, "Testing ReActAgent", testReActAgent, spec.Report(report.Terminal{}))
26
}
27
28
func testReActAgent(t *testing.T, when spec.G, it spec.S) {
29
var (
30
ctrl *gomock.Controller
31
llm *MockLLM
32
runner *MockRunner
33
budget *MockBudget
34
clock *MockClock
35
36
reactAgent *react.ReActAgent
37
ctx context.Context
38
now time.Time
39
)
40
41
it.Before(func() {
42
RegisterTestingT(t)
43
44
ctrl = gomock.NewController(t)
45
llm = NewMockLLM(ctrl)
46
runner = NewMockRunner(ctrl)
47
budget = NewMockBudget(ctrl)
48
clock = NewMockClock(ctrl)
49
50
reactAgent = react.NewReActAgent(llm, runner, budget, clock)
51
ctx = context.Background()
52
now = time.Date(2026, 1, 15, 10, 0, 0, 0, time.UTC)
53
54
clock.EXPECT().Now().Return(now).AnyTimes()
55
})
56
57
it.After(func() {
58
ctrl.Finish()
59
})
60
61
when("LLM returns final answer immediately", func() {
	it("returns the answer without tool calls", func() {
		// Single iteration: the budget admits the iteration and the LLM call.
		// The snapshot is empty (no token limit).
		budget.EXPECT().AllowIteration(now).Return(nil)
		budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
		budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)

		llm.EXPECT().
			Complete(gomock.Any(), gomock.Any()).
			Return(`{
				"thought": "The answer is simple",
				"action_type": "answer",
				"final_answer": "42"
			}`, 10, nil)

		// The 10 tokens reported by the LLM call are charged to the budget.
		budget.EXPECT().ChargeLLMTokens(10, now)

		// No runner expectation is registered, so any tool execution would
		// fail the test through the mock controller.
		res, err := reactAgent.RunAgentGoal(ctx, "What is the answer?")
		Expect(err).NotTo(HaveOccurred())
		// Pin the returned answer so a dropped or altered final_answer is caught.
		Expect(res).To(Equal("42"))
	})
})
82
83
when("LLM uses a shell tool then answers", func() {
	it("executes the tool and returns the final answer", func() {
		// Iteration 1: the model asks for `ls -la` through the shell tool.
		budget.EXPECT().AllowIteration(now).Return(nil)
		budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
		budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)

		llm.EXPECT().
			Complete(gomock.Any(), gomock.Any()).
			Return(`{
				"thought": "I need to list files",
				"action_type": "tool",
				"tool": "shell",
				"command": "ls",
				"args": ["-la"]
			}`, 15, nil)

		budget.EXPECT().ChargeLLMTokens(15, now)

		// The runner must receive the shell step exactly as the model described it.
		runner.EXPECT().
			RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
			DoAndReturn(func(_ context.Context, _ types.Config, step types.Step) (types.StepResult, error) {
				Expect(step.Type).To(Equal(types.ToolShell))
				Expect(step.Command).To(Equal("ls"))
				Expect(step.Args).To(Equal([]string{"-la"}))
				return types.StepResult{
					Outcome:  types.OutcomeOK,
					Output:   "file1.txt\nfile2.txt",
					Duration: 100 * time.Millisecond,
				}, nil
			})

		// Iteration 2: the tool output is visible in the prompt and the model answers.
		budget.EXPECT().AllowIteration(now).Return(nil)
		budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
		budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)

		llm.EXPECT().
			Complete(gomock.Any(), gomock.Any()).
			DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
				Expect(prompt).To(ContainSubstring("OBSERVATION: file1.txt"))
				return `{
					"thought": "I have the file list",
					"action_type": "answer",
					"final_answer": "There are 2 files"
				}`, 12, nil
			})

		budget.EXPECT().ChargeLLMTokens(12, now)

		res, err := reactAgent.RunAgentGoal(ctx, "How many files?")
		Expect(err).NotTo(HaveOccurred())
		// The test name promises the final answer is returned, so assert it.
		Expect(res).To(Equal("There are 2 files"))
	})
})
138
139
when("Budget is exceeded", func() {
140
it("returns Budget error", func() {
141
142
budget.EXPECT().AllowIteration(now).Return(nil) // allow iteration
143
budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
144
budget.EXPECT().AllowTool(types.ToolLLM, now).Return(core.BudgetExceededError{
145
Kind: core.BudgetKindLLM,
146
Limit: 5,
147
Used: 5,
148
Message: "LLM call Budget exceeded",
149
})
150
151
_, err := reactAgent.RunAgentGoal(ctx, "Do something")
152
Expect(err).To(HaveOccurred())
153
Expect(err.Error()).To(ContainSubstring("LLM call Budget exceeded"))
154
})
155
})
156
157
when("LLM returns invalid JSON", func() {
158
it("recovers in-band and returns a final answer", func() {
159
160
// Iteration 1: invalid output
161
budget.EXPECT().AllowIteration(now).Return(nil)
162
budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
163
budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
164
165
llm.EXPECT().
166
Complete(gomock.Any(), gomock.Any()).
167
Return("not valid json", 5, nil)
168
169
budget.EXPECT().ChargeLLMTokens(5, now)
170
171
// Iteration 2: recovery prompt, model now returns valid answer JSON
172
budget.EXPECT().AllowIteration(now).Return(nil)
173
budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
174
budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
175
176
llm.EXPECT().
177
Complete(gomock.Any(), gomock.Any()).
178
DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
179
Expect(prompt).To(ContainSubstring("ACTION_TAKEN: tool=LLM details=INVALID_RESPONSE"))
180
Expect(prompt).To(ContainSubstring("OBSERVATION: ERROR: Your last response violated the ReAct protocol"))
181
Expect(prompt).To(ContainSubstring("failed to locate JSON object"))
182
Expect(prompt).To(ContainSubstring(`Raw response (truncated): "not valid json"`))
183
return `{
184
"thought": "ok",
185
"action_type": "answer",
186
"final_answer": "recovered"
187
}`, 7, nil
188
})
189
190
budget.EXPECT().ChargeLLMTokens(7, now)
191
192
res, err := reactAgent.RunAgentGoal(ctx, "Do something")
193
Expect(err).NotTo(HaveOccurred())
194
Expect(res).To(Equal("recovered"))
195
})
196
197
it("recovers in-band, then hard-fails after max parse recoveries", func() {
198
199
// We expect 4 attempts total:
200
// 1..3 => recovery continues
201
// 4 => parseRecoveries becomes 4 (>3) => returns error
202
for i := 0; i < 4; i++ {
203
budget.EXPECT().AllowIteration(now).Return(nil)
204
budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
205
budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
206
207
if i == 0 {
208
llm.EXPECT().
209
Complete(gomock.Any(), gomock.Any()).
210
Return("not valid json", 5, nil)
211
} else {
212
// From attempt 2 onward, prompt should include the recovery observations.
213
llm.EXPECT().
214
Complete(gomock.Any(), gomock.Any()).
215
DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
216
Expect(prompt).To(ContainSubstring("ACTION_TAKEN: tool=LLM details=INVALID_RESPONSE"))
217
Expect(prompt).To(ContainSubstring("OBSERVATION: ERROR: Your last response violated the ReAct protocol"))
218
Expect(prompt).To(ContainSubstring(`"action_type"`))
219
return "not valid json", 5, nil
220
})
221
}
222
223
budget.EXPECT().ChargeLLMTokens(5, now)
224
}
225
226
_, err := reactAgent.RunAgentGoal(ctx, "Do something")
227
Expect(err).To(HaveOccurred())
228
Expect(err.Error()).To(ContainSubstring("agent failed to produce valid JSON after 3 attempts"))
229
Expect(err.Error()).To(ContainSubstring("failed to locate JSON"))
230
})
231
})
232
233
when("LLM returns JSON with missing action_type", func() {
234
it("recovers in-band and returns a final answer", func() {
235
236
// Iteration 1: parse succeeds, validation fails (missing action_type) -> triggers recovery path
237
budget.EXPECT().AllowIteration(now).Return(nil)
238
budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
239
budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
240
241
llm.EXPECT().
242
Complete(gomock.Any(), gomock.Any()).
243
Return(`{"thought":"thinking"}`, 5, nil)
244
245
budget.EXPECT().ChargeLLMTokens(5, now)
246
247
// Iteration 2: recovery prompt, model returns valid answer JSON
248
budget.EXPECT().AllowIteration(now).Return(nil)
249
budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
250
budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
251
252
llm.EXPECT().
253
Complete(gomock.Any(), gomock.Any()).
254
DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
255
Expect(prompt).To(ContainSubstring("ACTION_TAKEN: tool=LLM details=INVALID_RESPONSE"))
256
Expect(prompt).To(ContainSubstring("OBSERVATION: ERROR: Your last response violated the ReAct protocol"))
257
Expect(prompt).To(ContainSubstring("missing action_type"))
258
// raw is JSON, so the snippet should include it
259
Expect(prompt).To(ContainSubstring(`Raw response (truncated): "{\"thought\":\"thinking\"}"`))
260
return `{
261
"thought": "ok",
262
"action_type": "answer",
263
"final_answer": "recovered"
264
}`, 7, nil
265
})
266
267
budget.EXPECT().ChargeLLMTokens(7, now)
268
269
res, err := reactAgent.RunAgentGoal(ctx, "Do something")
270
Expect(err).NotTo(HaveOccurred())
271
Expect(res).To(Equal("recovered"))
272
})
273
274
it("recovers in-band, then hard-fails after max parse recoveries", func() {
275
276
for i := 0; i < 4; i++ {
277
budget.EXPECT().AllowIteration(now).Return(nil)
278
budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
279
budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
280
281
if i == 0 {
282
llm.EXPECT().
283
Complete(gomock.Any(), gomock.Any()).
284
Return(`{"thought":"thinking"}`, 5, nil)
285
} else {
286
llm.EXPECT().
287
Complete(gomock.Any(), gomock.Any()).
288
DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
289
Expect(prompt).To(ContainSubstring("ACTION_TAKEN: tool=LLM details=INVALID_RESPONSE"))
290
Expect(prompt).To(ContainSubstring("OBSERVATION: ERROR: Your last response violated the ReAct protocol"))
291
Expect(prompt).To(ContainSubstring("missing action_type"))
292
return `{"thought":"thinking"}`, 5, nil
293
})
294
}
295
296
budget.EXPECT().ChargeLLMTokens(5, now)
297
}
298
299
_, err := reactAgent.RunAgentGoal(ctx, "Do something")
300
Expect(err).To(HaveOccurred())
301
Expect(err.Error()).To(ContainSubstring("agent failed to produce valid JSON after 3 attempts"))
302
Expect(err.Error()).To(ContainSubstring("missing action_type"))
303
})
304
})
305
306
when("tool execution fails", func() {
307
it("returns the execution error", func() {
308
309
budget.EXPECT().AllowIteration(now).Return(nil)
310
budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
311
budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
312
313
llm.EXPECT().
314
Complete(gomock.Any(), gomock.Any()).
315
Return(`{
316
"thought": "running command",
317
"action_type": "tool",
318
"tool": "shell",
319
"command": "false"
320
}`, 10, nil)
321
322
budget.EXPECT().ChargeLLMTokens(10, now)
323
324
runner.EXPECT().
325
RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
326
Return(types.StepResult{
327
Outcome: types.OutcomeError,
328
Transcript: "command failed",
329
}, errors.New("exit 1"))
330
331
_, err := reactAgent.RunAgentGoal(ctx, "Run false")
332
Expect(err).To(HaveOccurred())
333
Expect(err.Error()).To(ContainSubstring("exit 1"))
334
})
335
})
336
337
when("iteration Budget is exceeded", func() {
338
it("returns iteration Budget exceeded error", func() {
339
340
// 10 successful iterations, then fail on 11th AllowIteration
341
for i := 0; i < 10; i++ {
342
budget.EXPECT().AllowIteration(now).Return(nil)
343
budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
344
budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
345
346
llm.EXPECT().
347
Complete(gomock.Any(), gomock.Any()).
348
Return(fmt.Sprintf(`{
349
"thought": "still working",
350
"action_type": "tool",
351
"tool": "shell",
352
"command": "echo",
353
"args": ["test-%d"]
354
}`, i), 10, nil)
355
356
budget.EXPECT().ChargeLLMTokens(10, now)
357
}
358
359
// We expect 10 tool executions total.
360
runner.EXPECT().
361
RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
362
Times(10).
363
Return(types.StepResult{
364
Outcome: types.OutcomeOK,
365
Output: "ok",
366
Duration: 10 * time.Millisecond,
367
}, nil)
368
369
budget.EXPECT().AllowIteration(now).Return(core.BudgetExceededError{
370
Kind: core.BudgetKindIterations,
371
Limit: 10,
372
Used: 10,
373
Message: "iteration Budget exceeded",
374
})
375
376
_, err := reactAgent.RunAgentGoal(ctx, "Keep looping")
377
Expect(err).To(HaveOccurred())
378
Expect(err.Error()).To(ContainSubstring("iteration Budget exceeded"))
379
})
380
})
381
382
when("LLM output has markdown code fences", func() {
	it("strips the fences and parses correctly", func() {
		budget.EXPECT().AllowIteration(now).Return(nil)
		budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
		budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)

		// The JSON payload is wrapped in a ```json fenced block.
		llm.EXPECT().
			Complete(gomock.Any(), gomock.Any()).
			Return("```json\n{\"thought\": \"done\", \"action_type\": \"answer\", \"final_answer\": \"Success\"}\n```", 10, nil)

		budget.EXPECT().ChargeLLMTokens(10, now)

		res, err := reactAgent.RunAgentGoal(ctx, "Test markdown")
		Expect(err).NotTo(HaveOccurred())
		// Assert the answer from inside the fenced payload was actually parsed.
		Expect(res).To(Equal("Success"))
	})
})

// This group is a sibling of the code-fence group rather than nested inside it,
// so the test report lists it under its own heading.
when("shell tool missing command", func() {
	it("injects error observation and lets LLM recover", func() {
		// Iteration 1: invalid shell tool (missing command)
		budget.EXPECT().AllowIteration(now).Return(nil)
		budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
		budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)

		llm.EXPECT().
			Complete(gomock.Any(), gomock.Any()).
			Return(`{
				"thought": "using shell",
				"action_type": "tool",
				"tool": "shell",
				"command": ""
			}`, 10, nil)

		budget.EXPECT().ChargeLLMTokens(10, now)

		// Iteration 2: agent should surface validation error to the model as an observation,
		// then model answers (or chooses a different tool).
		budget.EXPECT().AllowIteration(now).Return(nil)
		budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
		budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)

		llm.EXPECT().
			Complete(gomock.Any(), gomock.Any()).
			DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
				Expect(prompt).To(ContainSubstring("OBSERVATION: ERROR"))
				Expect(prompt).To(ContainSubstring("shell tool requires command"))
				return `{
					"thought": "ok",
					"action_type": "answer",
					"final_answer": "cannot run shell without a command"
				}`, 1, nil
			})

		budget.EXPECT().ChargeLLMTokens(1, now)

		// No runner expectation is registered: the invalid step must never be executed.
		res, err := reactAgent.RunAgentGoal(ctx, "Test")
		Expect(err).NotTo(HaveOccurred())
		Expect(res).To(Equal("cannot run shell without a command"))
	})
})
444
445
when("LLM uses shorthand action_type like file/shell/LLM", func() {
	it("treats action_type=file as a tool call", func() {
		budget.EXPECT().AllowIteration(now).Return(nil)
		budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
		budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)

		// NOTE: action_type is "file" (shorthand). tool is omitted.
		llm.EXPECT().
			Complete(gomock.Any(), gomock.Any()).
			Return(`{
				"thought": "read it",
				"action_type": "file",
				"op": "read",
				"path": "AGENTS.md"
			}`, 10, nil)

		budget.EXPECT().ChargeLLMTokens(10, now)

		// The shorthand must arrive at the runner as a file-tool read step.
		runner.EXPECT().
			RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
			DoAndReturn(func(_ context.Context, _ types.Config, step types.Step) (types.StepResult, error) {
				Expect(step.Type).To(Equal(types.ToolFiles))
				Expect(step.Op).To(Equal("read"))
				Expect(step.Path).To(Equal("AGENTS.md"))
				return types.StepResult{
					Outcome:  types.OutcomeOK,
					Output:   "ok",
					Duration: 1 * time.Millisecond,
				}, nil
			})

		// Next iteration: answer
		budget.EXPECT().AllowIteration(now).Return(nil)
		budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
		budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)

		llm.EXPECT().
			Complete(gomock.Any(), gomock.Any()).
			Return(`{
				"thought": "done",
				"action_type": "answer",
				"final_answer": "ok"
			}`, 5, nil)

		budget.EXPECT().ChargeLLMTokens(5, now)

		res, err := reactAgent.RunAgentGoal(ctx, "Read AGENTS")
		Expect(err).NotTo(HaveOccurred())
		// Confirm the run finishes with the model's final answer.
		Expect(res).To(Equal("ok"))
	})
})
496
497
when("LLM returns multiple JSON objects back-to-back", func() {
498
it("parses only the first JSON object", func() {
499
500
budget.EXPECT().AllowIteration(now).Return(nil)
501
budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
502
budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
503
504
// Two JSON objects concatenated. parseReActResponse should take only the first.
505
llm.EXPECT().
506
Complete(gomock.Any(), gomock.Any()).
507
Return(
508
`{"thought":"one","action_type":"answer","final_answer":"A"}{"thought":"two","action_type":"answer","final_answer":"B"}`,
509
10,
510
nil,
511
)
512
513
budget.EXPECT().ChargeLLMTokens(10, now)
514
515
res, err := reactAgent.RunAgentGoal(ctx, "Test")
516
Expect(err).NotTo(HaveOccurred())
517
Expect(res).To(Equal("A"))
518
})
519
})
520
521
when("LLM repeats the same tool call twice in a row", func() {
522
it("injects a repetition observation and forces a different next step", func() {
523
524
// Iteration 1: tool
525
budget.EXPECT().AllowIteration(now).Return(nil)
526
budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
527
budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
528
529
llm.EXPECT().
530
Complete(gomock.Any(), gomock.Any()).
531
Return(`{
532
"thought": "do it",
533
"action_type": "tool",
534
"tool": "shell",
535
"command": "ls",
536
"args": ["-la"]
537
}`, 10, nil)
538
539
budget.EXPECT().ChargeLLMTokens(10, now)
540
541
// Only ONE tool execution should happen (iteration 1).
542
runner.EXPECT().
543
RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
544
Times(1).
545
Return(types.StepResult{
546
Outcome: types.OutcomeOK,
547
Output: "file1\nfile2\n",
548
Duration: 1 * time.Millisecond,
549
}, nil)
550
551
// Iteration 2: same tool again (should be blocked by repetition guard)
552
budget.EXPECT().AllowIteration(now).Return(nil)
553
budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
554
budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
555
556
llm.EXPECT().
557
Complete(gomock.Any(), gomock.Any()).
558
Return(`{
559
"thought": "try again",
560
"action_type": "tool",
561
"tool": "shell",
562
"command": "ls",
563
"args": ["-la"]
564
}`, 10, nil)
565
566
budget.EXPECT().ChargeLLMTokens(10, now)
567
568
// Iteration 3: must see injected repetition message in prompt, then answer
569
budget.EXPECT().AllowIteration(now).Return(nil)
570
budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
571
budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
572
573
llm.EXPECT().
574
Complete(gomock.Any(), gomock.Any()).
575
DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
576
Expect(prompt).To(ContainSubstring("OBSERVATION: You are repeating the same tool call"))
577
return `{
578
"thought": "ok, I'll stop repeating",
579
"action_type": "answer",
580
"final_answer": "done"
581
}`, 5, nil
582
})
583
584
budget.EXPECT().ChargeLLMTokens(5, now)
585
586
res, err := reactAgent.RunAgentGoal(ctx, "List files")
587
Expect(err).NotTo(HaveOccurred())
588
Expect(res).To(Equal("done"))
589
})
590
})
591
592
when("LLM ignores repetition warnings", func() {
593
it("hard-stops after too many repeats in the rolling window", func() {
594
595
// We'll do 6 iterations total (1 executes, 2-5 skipped, 6 hard-stops)
596
for i := 0; i < 6; i++ {
597
budget.EXPECT().AllowIteration(now).Return(nil)
598
budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
599
budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
600
601
llm.EXPECT().
602
Complete(gomock.Any(), gomock.Any()).
603
Return(`{
604
"thought": "list files again",
605
"action_type": "tool",
606
"tool": "shell",
607
"command": "ls",
608
"args": ["-la"]
609
}`, 1, nil)
610
611
budget.EXPECT().ChargeLLMTokens(1, now)
612
}
613
614
// Only the FIRST iteration should actually execute the tool.
615
runner.EXPECT().
616
RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
617
Times(1).
618
DoAndReturn(func(_ context.Context, _ types.Config, step types.Step) (types.StepResult, error) {
619
Expect(step.Type).To(Equal(types.ToolShell))
620
Expect(step.Command).To(Equal("ls"))
621
Expect(step.Args).To(Equal([]string{"-la"}))
622
return types.StepResult{
623
Outcome: types.OutcomeOK,
624
Output: "file1\nfile2\n",
625
Duration: 10 * time.Millisecond,
626
}, nil
627
})
628
629
_, err := reactAgent.RunAgentGoal(ctx, "Loop forever")
630
Expect(err).To(HaveOccurred())
631
Expect(err.Error()).To(ContainSubstring("agent appears stuck"))
632
Expect(err.Error()).To(ContainSubstring("repeated tool call too many times"))
633
})
634
})
635
636
when("LLM uses shorthand action_type=file (no tool field)", func() {
	it("treats it as a tool call and executes file op", func() {
		// Iteration 1: shorthand file tool
		budget.EXPECT().AllowIteration(now).Return(nil)
		budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
		budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)

		llm.EXPECT().
			Complete(gomock.Any(), gomock.Any()).
			Return(`{
				"thought": "read README",
				"action_type": "file",
				"op": "read",
				"path": "README.md"
			}`, 10, nil)

		budget.EXPECT().ChargeLLMTokens(10, now)

		// The runner must see a file-tool read of README.md.
		runner.EXPECT().
			RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
			DoAndReturn(func(_ context.Context, _ types.Config, step types.Step) (types.StepResult, error) {
				Expect(step.Type).To(Equal(types.ToolFiles))
				Expect(step.Op).To(Equal("read"))
				Expect(step.Path).To(Equal("README.md"))
				return types.StepResult{
					Outcome:  types.OutcomeOK,
					Output:   "README CONTENT",
					Duration: 5 * time.Millisecond,
				}, nil
			})

		// Iteration 2: the file content is visible in the prompt and the model answers.
		budget.EXPECT().AllowIteration(now).Return(nil)
		budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
		budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)

		llm.EXPECT().
			Complete(gomock.Any(), gomock.Any()).
			DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
				Expect(prompt).To(ContainSubstring("OBSERVATION: README CONTENT"))
				return `{
					"thought": "done",
					"action_type": "answer",
					"final_answer": "ok"
				}`, 1, nil
			})

		budget.EXPECT().ChargeLLMTokens(1, now)

		res, err := reactAgent.RunAgentGoal(ctx, "Read README and answer")
		Expect(err).NotTo(HaveOccurred())
		// Confirm the run finishes with the model's final answer.
		Expect(res).To(Equal("ok"))
	})
})
690
691
when("LLM uses shorthand action_type=file AND also sets tool=file", func() {
	it("still treats it as a tool call (compat mode)", func() {
		// Iteration 1: shorthand but with tool field present
		budget.EXPECT().AllowIteration(now).Return(nil)
		budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
		budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)

		llm.EXPECT().
			Complete(gomock.Any(), gomock.Any()).
			Return(`{
				"thought": "read AGENTS",
				"action_type": "file",
				"tool": "file",
				"op": "read",
				"path": "AGENTS.md"
			}`, 10, nil)

		budget.EXPECT().ChargeLLMTokens(10, now)

		// Exactly one step is expected to reach the runner.
		runner.EXPECT().
			RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
			Return(types.StepResult{
				Outcome:  types.OutcomeOK,
				Output:   "AGENTS CONTENT",
				Duration: 5 * time.Millisecond,
			}, nil)

		// Iteration 2: answer
		budget.EXPECT().AllowIteration(now).Return(nil)
		budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
		budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)

		llm.EXPECT().
			Complete(gomock.Any(), gomock.Any()).
			Return(`{
				"thought": "done",
				"action_type": "answer",
				"final_answer": "ok"
			}`, 1, nil)

		budget.EXPECT().ChargeLLMTokens(1, now)

		res, err := reactAgent.RunAgentGoal(ctx, "Read AGENTS and answer")
		Expect(err).NotTo(HaveOccurred())
		// Confirm the run finishes with the model's final answer.
		Expect(res).To(Equal("ok"))
	})
})
738
739
when("LLM uses shorthand action_type=file but tool mismatches", func() {
740
it("recovers in-band rather than failing the whole run", func() {
741
742
// Iteration 1: invalid action_type/tool combination -> parseReActResponse returns invalid action_type
743
budget.EXPECT().AllowIteration(now).Return(nil)
744
budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
745
budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
746
747
llm.EXPECT().
748
Complete(gomock.Any(), gomock.Any()).
749
Return(`{
750
"thought": "oops",
751
"action_type": "file",
752
"tool": "shell",
753
"command": "ls"
754
}`, 10, nil)
755
756
budget.EXPECT().ChargeLLMTokens(10, now)
757
758
// Iteration 2: recovery prompt, model returns a valid final answer
759
budget.EXPECT().AllowIteration(now).Return(nil)
760
budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
761
budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
762
763
llm.EXPECT().
764
Complete(gomock.Any(), gomock.Any()).
765
DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
766
Expect(prompt).To(ContainSubstring("ACTION_TAKEN: tool=LLM details=INVALID_RESPONSE"))
767
Expect(prompt).To(ContainSubstring("OBSERVATION: ERROR: Your last response violated the ReAct protocol"))
768
Expect(prompt).To(ContainSubstring(`invalid action_type: "file"`))
769
Expect(prompt).To(ContainSubstring(`"tool": "shell"`))
770
return `{
771
"thought": "ack, correct schema",
772
"action_type": "answer",
773
"final_answer": "recovered"
774
}`, 5, nil
775
})
776
777
budget.EXPECT().ChargeLLMTokens(5, now)
778
779
res, err := reactAgent.RunAgentGoal(ctx, "Bad shorthand")
780
Expect(err).NotTo(HaveOccurred())
781
Expect(res).To(Equal("recovered"))
782
})
783
784
it("recovers in-band, then hard-fails after max parse recoveries", func() {
785
786
// This is a validation error inside parseReActResponse: invalid action_type: "file"
787
bad := `{
788
"thought": "oops",
789
"action_type": "file",
790
"tool": "shell",
791
"command": "ls"
792
}`
793
794
for i := 0; i < 4; i++ {
795
budget.EXPECT().AllowIteration(now).Return(nil)
796
budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
797
budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
798
799
if i == 0 {
800
llm.EXPECT().
801
Complete(gomock.Any(), gomock.Any()).
802
Return(bad, 10, nil)
803
} else {
804
llm.EXPECT().
805
Complete(gomock.Any(), gomock.Any()).
806
DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
807
Expect(prompt).To(ContainSubstring("ACTION_TAKEN: tool=LLM details=INVALID_RESPONSE"))
808
Expect(prompt).To(ContainSubstring("invalid action_type"))
809
Expect(prompt).To(ContainSubstring(`Raw response (truncated)`))
810
return bad, 10, nil
811
})
812
}
813
814
budget.EXPECT().ChargeLLMTokens(10, now)
815
}
816
817
_, err := reactAgent.RunAgentGoal(ctx, "Bad shorthand")
818
Expect(err).To(HaveOccurred())
819
Expect(err.Error()).To(ContainSubstring("agent failed to produce valid JSON after 3 attempts"))
820
Expect(err.Error()).To(ContainSubstring(`invalid action_type: "file"`))
821
})
822
})
823
824
when("LLM uses file patch", func() {
	it("converts to a ToolFiles step and executes it", func() {
		// Iteration 1: the model requests a patch of a.txt with a unified diff in "data".
		budget.EXPECT().AllowIteration(now).Return(nil)
		budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
		budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)

		llm.EXPECT().Complete(gomock.Any(), gomock.Any()).Return(`{
			"thought":"apply diff",
			"action_type":"tool",
			"tool":"file",
			"op":"patch",
			"path":"a.txt",
			"data":"--- a/a.txt\n+++ b/a.txt\n@@\n+hi\n"
		}`, 1, nil)
		budget.EXPECT().ChargeLLMTokens(1, now)

		// The step handed to the runner must carry the op, path and diff payload.
		runner.EXPECT().RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
			DoAndReturn(func(_ context.Context, _ types.Config, step types.Step) (types.StepResult, error) {
				Expect(step.Type).To(Equal(types.ToolFiles))
				Expect(step.Op).To(Equal("patch"))
				Expect(step.Path).To(Equal("a.txt"))
				Expect(step.Data).To(ContainSubstring("+++ b/a.txt"))
				return types.StepResult{Outcome: types.OutcomeOK, Output: "patched"}, nil
			})

		// next iteration: answer
		budget.EXPECT().AllowIteration(now).Return(nil)
		budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
		budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)

		llm.EXPECT().Complete(gomock.Any(), gomock.Any()).Return(`{
			"thought":"done",
			"action_type":"answer",
			"final_answer":"ok"
		}`, 1, nil)
		budget.EXPECT().ChargeLLMTokens(1, now)

		res, err := reactAgent.RunAgentGoal(ctx, "Patch a.txt")
		Expect(err).NotTo(HaveOccurred())
		// Confirm the run finishes with the model's final answer.
		Expect(res).To(Equal("ok"))
	})
})
865
866
when("LLM uses file replace", func() {
	it("converts to a ToolFiles step with Old/New/N", func() {
		// Iteration 1: the model requests a replace of "foo" with "bar" and n=2.
		budget.EXPECT().AllowIteration(now).Return(nil)
		budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
		budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)

		llm.EXPECT().Complete(gomock.Any(), gomock.Any()).Return(`{
			"thought":"swap token",
			"action_type":"tool",
			"tool":"file",
			"op":"replace",
			"path":"a.txt",
			"old":"foo",
			"new":"bar",
			"n":2
		}`, 1, nil)
		budget.EXPECT().ChargeLLMTokens(1, now)

		// The step handed to the runner must carry old/new/n unchanged.
		runner.EXPECT().RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
			DoAndReturn(func(_ context.Context, _ types.Config, step types.Step) (types.StepResult, error) {
				Expect(step.Type).To(Equal(types.ToolFiles))
				Expect(step.Op).To(Equal("replace"))
				Expect(step.Path).To(Equal("a.txt"))
				Expect(step.Old).To(Equal("foo"))
				Expect(step.New).To(Equal("bar"))
				Expect(step.N).To(Equal(2))
				return types.StepResult{Outcome: types.OutcomeOK, Output: "replaced"}, nil
			})

		// Iteration 2: answer
		budget.EXPECT().AllowIteration(now).Return(nil)
		budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
		budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
		llm.EXPECT().Complete(gomock.Any(), gomock.Any()).Return(`{
			"thought":"done",
			"action_type":"answer",
			"final_answer":"ok"
		}`, 1, nil)
		budget.EXPECT().ChargeLLMTokens(1, now)

		res, err := reactAgent.RunAgentGoal(ctx, "Replace in a.txt")
		Expect(err).NotTo(HaveOccurred())
		// Confirm the run finishes with the model's final answer.
		Expect(res).To(Equal("ok"))
	})
})
909
910
when("LLM uses file patch without data", func() {
911
it("injects error observation and lets LLM recover", func() {
912
// Iteration 1: invalid patch
913
budget.EXPECT().AllowIteration(now).Return(nil)
914
budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
915
budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
916
917
llm.EXPECT().Complete(gomock.Any(), gomock.Any()).Return(`{
918
"thought":"patch",
919
"action_type":"tool",
920
"tool":"file",
921
"op":"patch",
922
"path":"a.txt",
923
"data":" "
924
}`, 1, nil)
925
budget.EXPECT().ChargeLLMTokens(1, now)
926
927
// Iteration 2: model sees error and answers
928
budget.EXPECT().AllowIteration(now).Return(nil)
929
budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
930
budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
931
932
llm.EXPECT().Complete(gomock.Any(), gomock.Any()).
933
DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
934
Expect(prompt).To(ContainSubstring("file patch requires data"))
935
Expect(prompt).To(ContainSubstring("OBSERVATION: ERROR"))
936
return `{
937
"thought":"ok",
938
"action_type":"answer",
939
"final_answer":"fixed"
940
}`, 1, nil
941
})
942
budget.EXPECT().ChargeLLMTokens(1, now)
943
944
res, err := reactAgent.RunAgentGoal(ctx, "Patch")
945
Expect(err).NotTo(HaveOccurred())
946
Expect(res).To(Equal("fixed"))
947
})
948
})
949
950
when("LLM uses file replace without old", func() {
951
it("injects error observation and lets LLM recover", func() {
952
// Iteration 1: invalid replace
953
budget.EXPECT().AllowIteration(now).Return(nil)
954
budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
955
budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
956
957
llm.EXPECT().Complete(gomock.Any(), gomock.Any()).Return(`{
958
"thought":"replace",
959
"action_type":"tool",
960
"tool":"file",
961
"op":"replace",
962
"path":"a.txt",
963
"new":""
964
}`, 1, nil)
965
budget.EXPECT().ChargeLLMTokens(1, now)
966
967
// Iteration 2: recover
968
budget.EXPECT().AllowIteration(now).Return(nil)
969
budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
970
budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
971
972
llm.EXPECT().Complete(gomock.Any(), gomock.Any()).
973
DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
974
Expect(prompt).To(ContainSubstring("file replace requires old"))
975
Expect(prompt).To(ContainSubstring("OBSERVATION: ERROR"))
976
return `{
977
"thought":"ok",
978
"action_type":"answer",
979
"final_answer":"recovered"
980
}`, 1, nil
981
})
982
budget.EXPECT().ChargeLLMTokens(1, now)
983
984
res, err := reactAgent.RunAgentGoal(ctx, "Replace")
985
Expect(err).NotTo(HaveOccurred())
986
Expect(res).To(Equal("recovered"))
987
})
988
})
989
990
when("patch fails and agent falls back to full write", func() {
	// Scripts a three-turn run: a patch step that the runner reports as
	// failed, a full-file write chosen after the failure observation, and a
	// final answer. The returned answer is asserted as well as the error.
	it("continues after patch failure observation and then writes", func() {
		// expectLLMTurn registers the budget expectations for a single LLM turn.
		expectLLMTurn := func() {
			budget.EXPECT().AllowIteration(now).Return(nil)
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
			budget.EXPECT().ChargeLLMTokens(1, now)
		}

		// Turn 1: the model tries a patch first.
		expectLLMTurn()
		llm.EXPECT().Complete(gomock.Any(), gomock.Any()).Return(`{
			"thought":"try patch first",
			"action_type":"tool",
			"tool":"file",
			"op":"patch",
			"path":"a.txt",
			"data":"--- a/a.txt\n+++ b/a.txt\n@@\n+hi\n"
		}`, 1, nil)

		runner.EXPECT().RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
			DoAndReturn(func(_ context.Context, _ types.Config, step types.Step) (types.StepResult, error) {
				Expect(step.Op).To(Equal("patch"))
				// The failure travels in Outcome/Output; the returned error stays nil.
				failed := types.StepResult{
					Outcome:  types.OutcomeError,
					Output:   "patch failed: hunk did not apply",
					Duration: 1 * time.Millisecond,
				}
				return failed, nil
			})

		// Turn 2: the prompt shows the failed patch and the model switches to a full write.
		expectLLMTurn()
		llm.EXPECT().Complete(gomock.Any(), gomock.Any()).
			DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
				Expect(prompt).To(ContainSubstring("OBSERVATION: ERROR:"))
				Expect(prompt).To(ContainSubstring("patch failed"))
				return `{
					"thought":"fallback to write full file",
					"action_type":"tool",
					"tool":"file",
					"op":"write",
					"path":"a.txt",
					"data":"FULL NEW CONTENT\n"
				}`, 1, nil
			})

		runner.EXPECT().RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
			DoAndReturn(func(_ context.Context, _ types.Config, step types.Step) (types.StepResult, error) {
				Expect(step.Op).To(Equal("write"))
				Expect(step.Path).To(Equal("a.txt"))
				Expect(step.Data).To(Equal("FULL NEW CONTENT\n"))
				return types.StepResult{Outcome: types.OutcomeOK, Output: "wrote"}, nil
			})

		// Turn 3: the model answers.
		expectLLMTurn()
		llm.EXPECT().Complete(gomock.Any(), gomock.Any()).Return(`{
			"thought":"done",
			"action_type":"answer",
			"final_answer":"ok"
		}`, 1, nil)

		answer, err := reactAgent.RunAgentGoal(ctx, "Modify a.txt")
		Expect(err).NotTo(HaveOccurred())
		// Sibling specs assert the final answer; do the same here.
		Expect(answer).To(Equal("ok"))
	})
})
1062
1063
when("a step produces side effects", func() {
	// Scripts a two-turn run in which the first step reports one file.write
	// effect, then checks the second prompt for the state summary. The
	// returned answer is asserted as well as the error.
	it("includes STATE line with cumulative effects in the next prompt", func() {
		// expectLLMTurn registers the budget expectations for a single LLM turn.
		expectLLMTurn := func() {
			budget.EXPECT().AllowIteration(now).Return(nil)
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
			budget.EXPECT().ChargeLLMTokens(1, now)
		}

		// Turn 1: the model writes a file and the runner reports the effect.
		expectLLMTurn()
		llm.EXPECT().Complete(gomock.Any(), gomock.Any()).Return(`{
			"thought":"write a file",
			"action_type":"tool",
			"tool":"file",
			"op":"write",
			"path":"a.txt",
			"data":"hi"
		}`, 1, nil)

		writeResult := types.StepResult{
			Outcome:  types.OutcomeOK,
			Output:   "wrote",
			Duration: 1 * time.Millisecond,
			Effects: types.Effects{
				{Kind: "file.write", Path: "a.txt"},
			},
		}
		runner.EXPECT().RunStep(gomock.Any(), gomock.Any(), gomock.Any()).Return(writeResult, nil)

		// Turn 2: the prompt must summarise the effect recorded in turn 1.
		expectLLMTurn()
		llm.EXPECT().Complete(gomock.Any(), gomock.Any()).
			DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
				Expect(prompt).To(ContainSubstring("State:"))
				Expect(prompt).To(ContainSubstring("file.write x1"))
				return `{
					"thought":"done",
					"action_type":"answer",
					"final_answer":"ok"
				}`, 1, nil
			})

		answer, err := reactAgent.RunAgentGoal(ctx, "Write")
		Expect(err).NotTo(HaveOccurred())
		// Sibling specs assert the final answer; do the same here.
		Expect(answer).To(Equal("ok"))
	})
})
1111
1112
when("patch fails and agent injects FALLBACK REQUIRED guidance", func() {
	// Scripts a failed patch step and checks that the following prompt holds
	// both the ordinary error observation and the fallback guidance. The run
	// ends with an answer right away; the fallback itself is not executed.
	// The returned answer is asserted as well as the error.
	it("includes fallback-required instruction (read+write) in the next prompt", func() {
		// expectLLMTurn registers the budget expectations for a single LLM turn.
		expectLLMTurn := func() {
			budget.EXPECT().AllowIteration(now).Return(nil)
			budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
			budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
			budget.EXPECT().ChargeLLMTokens(1, now)
		}

		// Turn 1: the model tries a patch.
		expectLLMTurn()
		llm.EXPECT().Complete(gomock.Any(), gomock.Any()).Return(`{
			"thought":"try patch first",
			"action_type":"tool",
			"tool":"file",
			"op":"patch",
			"path":"a.txt",
			"data":"--- a/a.txt\n+++ b/a.txt\n@@ -1,1 +1,1 @@\n-old\n+new\n"
		}`, 1, nil)

		// The patch fails via OutcomeError while the returned error stays nil.
		runner.EXPECT().RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
			DoAndReturn(func(_ context.Context, _ types.Config, step types.Step) (types.StepResult, error) {
				Expect(step.Type).To(Equal(types.ToolFiles))
				Expect(step.Op).To(Equal("patch"))
				Expect(step.Path).To(Equal("a.txt"))
				failed := types.StepResult{
					Outcome:  types.OutcomeError,
					Output:   "invalid unified diff: missing hunk header",
					Duration: 1 * time.Millisecond,
				}
				return failed, nil
			})

		// Turn 2: the prompt under test. Fragments are checked in this order.
		requiredFragments := []string{
			// The normal error observation is still present.
			"OBSERVATION: ERROR:",
			"invalid unified diff",
			// Fallback guardrail lines.
			"OBSERVATION: FALLBACK REQUIRED",
			`Do NOT try op="patch" or op=patch/replace again`,
			// Explicit JSON skeleton pointing at a read of the same path.
			`{"action_type":"tool","tool":"file","op":"read","path":"a.txt"}`,
		}
		expectLLMTurn()
		llm.EXPECT().Complete(gomock.Any(), gomock.Any()).
			DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
				for _, fragment := range requiredFragments {
					Expect(prompt).To(ContainSubstring(fragment))
				}
				return `{
					"thought":"ack",
					"action_type":"answer",
					"final_answer":"ok"
				}`, 1, nil
			})

		answer, err := reactAgent.RunAgentGoal(ctx, "Modify a.txt")
		Expect(err).NotTo(HaveOccurred())
		// Sibling specs assert the final answer; do the same here.
		Expect(answer).To(Equal("ok"))
	})
})
1173
1174
when("agent already has transcript + history from a previous run", func() {
1175
it("resets them before starting a new run", func() {
1176
// Seed previous run leftovers
1177
reactAgent.AddTranscript("OLD_TRANSCRIPT_SHOULD_BE_CLEARED")
1178
reactAgent.AddHistory("OLD_HISTORY_SHOULD_BE_CLEARED")
1179
1180
budget.EXPECT().AllowIteration(now).Return(nil)
1181
budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
1182
budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
1183
1184
llm.EXPECT().
1185
Complete(gomock.Any(), gomock.Any()).
1186
Return(`{
1187
"thought": "ok",
1188
"action_type": "answer",
1189
"final_answer": "done"
1190
}`, 3, nil)
1191
1192
budget.EXPECT().ChargeLLMTokens(3, now)
1193
1194
res, err := reactAgent.RunAgentGoal(ctx, "New goal")
1195
Expect(err).NotTo(HaveOccurred())
1196
Expect(res).To(Equal("done"))
1197
1198
// Assert old data is gone
1199
Expect(reactAgent.TranscriptString()).NotTo(ContainSubstring("OLD_TRANSCRIPT_SHOULD_BE_CLEARED"))
1200
Expect(reactAgent.History()).NotTo(ContainSubstring("OLD_HISTORY_SHOULD_BE_CLEARED"))
1201
1202
// And the new goal is present (sanity check)
1203
Expect(reactAgent.TranscriptString()).To(ContainSubstring("[goal]"))
1204
Expect(reactAgent.TranscriptString()).To(ContainSubstring("New goal"))
1205
Expect(reactAgent.History()).To(ContainSubstring("USER: New goal"))
1206
})
1207
})
1208
1209
when("prompt logging is enabled and transcript max is small", func() {
1210
it("caps transcript length (and truncates) even if prompt is large", func() {
1211
// Create an agent with a tiny transcript buffer so truncation is guaranteed.
1212
// (We keep using the same mocks.)
1213
agent := react.NewReActAgent(
1214
llm, runner, budget, clock,
1215
core.WithTranscriptMaxBytes(120),
1216
core.WithPromptHistoryMaxBytes(120),
1217
)
1218
1219
budget.EXPECT().AllowIteration(now).Return(nil)
1220
budget.EXPECT().Snapshot(now).Return(core.BudgetSnapshot{})
1221
budget.EXPECT().AllowTool(types.ToolLLM, now).Return(nil)
1222
1223
llm.EXPECT().
1224
Complete(gomock.Any(), gomock.Any()).
1225
DoAndReturn(func(_ context.Context, prompt string) (string, int, error) {
1226
// The prompt should be big (your buildReActPrompt() is huge),
1227
// which is what makes prompt-logging risky if uncapped.
1228
Expect(len(prompt)).To(BeNumerically(">", 200))
1229
return `{
1230
"thought": "ok",
1231
"action_type": "answer",
1232
"final_answer": "done"
1233
}`, 2, nil
1234
})
1235
1236
budget.EXPECT().ChargeLLMTokens(2, now)
1237
1238
_, err := agent.RunAgentGoal(ctx, "Goal that triggers large prompt")
1239
Expect(err).NotTo(HaveOccurred())
1240
1241
ts := agent.TranscriptString()
1242
1243
// This is the actual regression check:
1244
// transcript must be capped at <= 120 bytes.
1245
Expect(len([]byte(ts))).To(BeNumerically("<=", 120))
1246
1247
// And it should show your truncation banner if it overflowed.
1248
// (banner text is "\n…(truncated)\n")
1249
Expect(ts).To(ContainSubstring("…(truncated)"))
1250
1251
// Optional: if you used the suggested prompt logging tag:
1252
// Expect(ts).To(ContainSubstring("[iteration 1][prompt]"))
1253
})
1254
})
1255
}
1256
1257