Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
kardolus
GitHub Repository: kardolus/chatgpt-cli
Path: blob/main/agent/planexec/plan_execute_agent_test.go
3434 views
1
package planexec_test
2
3
import (
4
"context"
5
"fmt"
6
"github.com/kardolus/chatgpt-cli/agent/core"
7
"github.com/kardolus/chatgpt-cli/agent/planexec"
8
"github.com/kardolus/chatgpt-cli/agent/types"
9
"strings"
10
"testing"
11
"time"
12
13
"github.com/golang/mock/gomock"
14
. "github.com/onsi/gomega"
15
"github.com/sclevine/spec"
16
"github.com/sclevine/spec/report"
17
)
18
19
//go:generate mockgen -destination=runnermocks_test.go -package=planexec_test github.com/kardolus/chatgpt-cli/agent/core Runner
20
//go:generate mockgen -destination=clockmocks_test.go -package=planexec_test github.com/kardolus/chatgpt-cli/agent/core Clock
21
//go:generate mockgen -destination=plannermocks_test.go -package=planexec_test github.com/kardolus/chatgpt-cli/agent/planexec Planner
22
23
func TestUnitAgent(t *testing.T) {
24
spec.Run(t, "Testing the plan-execute agent", testPlanExecuteAgent, spec.Report(report.Terminal{}))
25
}
26
27
func testPlanExecuteAgent(t *testing.T, when spec.G, it spec.S) {
28
var (
29
mockCtrl *gomock.Controller
30
mockClock *MockClock
31
mockRunner *MockRunner
32
mockPlanner *MockPlanner
33
)
34
35
it.Before(func() {
36
RegisterTestingT(t)
37
mockCtrl = gomock.NewController(t)
38
mockClock = NewMockClock(mockCtrl)
39
mockPlanner = NewMockPlanner(mockCtrl)
40
mockRunner = NewMockRunner(mockCtrl)
41
})
42
43
it.After(func() {
44
mockCtrl.Finish()
45
})
46
47
when("RunAgentGoal()", func() {
48
const goal = "test goal"
49
50
it.Before(func() {
51
expectAgentDuration(mockClock, 123*time.Millisecond)
52
})
53
54
it("should bubble up Planner errors and not run any steps", func() {
55
planErr := fmt.Errorf("Planner boom")
56
57
mockPlanner.
58
EXPECT().
59
Plan(gomock.Any(), goal).
60
Return(types.Plan{}, planErr).
61
Times(1)
62
63
// Runner must not be invoked if planning fails
64
mockRunner.
65
EXPECT().
66
RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
67
Times(0)
68
69
subject := planexec.NewPlanExecuteAgent(
70
mockClock,
71
mockPlanner,
72
mockRunner,
73
)
74
75
_, err := subject.RunAgentGoal(context.Background(), goal)
76
Expect(err).To(MatchError(planErr))
77
})
78
79
it("should bubble up Runner errors and stop executing further steps", func() {
80
runErr := fmt.Errorf("Runner boom")
81
82
plan := types.Plan{
83
Goal: goal,
84
Steps: []types.Step{
85
{Type: types.ToolShell, Description: "step 1", Command: "echo", Args: []string{"one"}},
86
{Type: types.ToolShell, Description: "step 2", Command: "echo", Args: []string{"two"}},
87
},
88
}
89
90
mockPlanner.
91
EXPECT().
92
Plan(gomock.Any(), goal).
93
Return(plan, nil).
94
Times(1)
95
96
mockRunner.
97
EXPECT().
98
RunStep(gomock.Any(), gomock.Any(), plan.Steps[0]).
99
Return(types.StepResult{}, runErr).
100
Times(1)
101
102
// Guard: step 2 must not run
103
mockRunner.
104
EXPECT().
105
RunStep(gomock.Any(), gomock.Any(), plan.Steps[1]).
106
Times(0)
107
108
subject := planexec.NewPlanExecuteAgent(
109
mockClock,
110
mockPlanner,
111
mockRunner,
112
)
113
114
_, err := subject.RunAgentGoal(context.Background(), goal)
115
Expect(err).To(MatchError(runErr))
116
})
117
118
it("should return an error when Runner returns OutcomeError (even if err == nil)", func() {
119
plan := types.Plan{
120
Goal: goal,
121
Steps: []types.Step{
122
{Type: types.ToolShell, Description: "step 1", Command: "false", Args: nil},
123
},
124
}
125
126
mockPlanner.
127
EXPECT().
128
Plan(gomock.Any(), goal).
129
Return(plan, nil).
130
Times(1)
131
132
mockRunner.
133
EXPECT().
134
RunStep(gomock.Any(), gomock.Any(), plan.Steps[0]).
135
Return(types.StepResult{
136
Step: plan.Steps[0],
137
Outcome: types.OutcomeError,
138
Exec: &types.Result{ExitCode: 42}, // optional; Agent no longer inspects this
139
}, nil).
140
Times(1)
141
142
subject := planexec.NewPlanExecuteAgent(
143
mockClock,
144
mockPlanner,
145
mockRunner,
146
)
147
148
_, err := subject.RunAgentGoal(context.Background(), goal)
149
Expect(err).To(MatchError(`step failed: step 1`))
150
})
151
152
it("should succeed when Planner returns an empty plan and not run any steps", func() {
153
plan := types.Plan{
154
Goal: goal,
155
Steps: nil,
156
}
157
158
mockPlanner.EXPECT().
159
Plan(gomock.Any(), goal).
160
Return(plan, nil).
161
Times(1)
162
163
mockRunner.EXPECT().
164
RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
165
Times(0)
166
167
subject := planexec.NewPlanExecuteAgent(
168
mockClock,
169
mockPlanner,
170
mockRunner,
171
)
172
173
out, err := subject.RunAgentGoal(context.Background(), goal)
174
Expect(err).NotTo(HaveOccurred())
175
Expect(out).To(BeEmpty())
176
})
177
178
it("should stop executing further steps when Runner returns OutcomeError", func() {
179
plan := types.Plan{
180
Goal: goal,
181
Steps: []types.Step{
182
{Type: types.ToolShell, Description: "step 1", Command: "false", Args: nil},
183
{Type: types.ToolShell, Description: "step 2", Command: "echo", Args: []string{"should-not-run"}},
184
},
185
}
186
187
mockPlanner.EXPECT().
188
Plan(gomock.Any(), goal).
189
Return(plan, nil).
190
Times(1)
191
192
mockRunner.EXPECT().
193
RunStep(gomock.Any(), gomock.Any(), plan.Steps[0]).
194
Return(types.StepResult{
195
Step: plan.Steps[0],
196
Outcome: types.OutcomeError,
197
Exec: &types.Result{ExitCode: 7},
198
}, nil).
199
Times(1)
200
201
// Guard: step 2 must not run
202
mockRunner.EXPECT().
203
RunStep(gomock.Any(), gomock.Any(), plan.Steps[1]).
204
Times(0)
205
206
subject := planexec.NewPlanExecuteAgent(
207
mockClock,
208
mockPlanner,
209
mockRunner,
210
)
211
212
_, err := subject.RunAgentGoal(context.Background(), goal)
213
Expect(err).To(MatchError(`step failed: step 1`))
214
})
215
216
it("should treat Exec == nil as success and continue to next step", func() {
217
plan := types.Plan{
218
Goal: goal,
219
Steps: []types.Step{
220
{Type: types.ToolLLM, Description: "llm step (no exec)", Prompt: "do something"},
221
{Type: types.ToolShell, Description: "shell step", Command: "echo", Args: []string{"ok"}},
222
},
223
}
224
225
mockPlanner.EXPECT().
226
Plan(gomock.Any(), goal).
227
Return(plan, nil).
228
Times(1)
229
230
// First step: Exec is nil, no error, OutcomeOK => success.
231
mockRunner.EXPECT().
232
RunStep(gomock.Any(), gomock.Any(), plan.Steps[0]).
233
Return(types.StepResult{
234
Step: plan.Steps[0],
235
Outcome: types.OutcomeOK,
236
Exec: nil,
237
}, nil).
238
Times(1)
239
240
// Second step should still run.
241
mockRunner.EXPECT().
242
RunStep(gomock.Any(), gomock.Any(), plan.Steps[1]).
243
Return(types.StepResult{
244
Step: plan.Steps[1],
245
Outcome: types.OutcomeOK,
246
Exec: &types.Result{ExitCode: 0},
247
}, nil).
248
Times(1)
249
250
subject := planexec.NewPlanExecuteAgent(
251
mockClock,
252
mockPlanner,
253
mockRunner,
254
)
255
256
_, err := subject.RunAgentGoal(context.Background(), goal)
257
Expect(err).NotTo(HaveOccurred())
258
})
259
260
it("WithWorkDir should pass cfg.WorkDir into the Runner", func() {
261
plan := types.Plan{
262
Goal: goal,
263
Steps: []types.Step{
264
{Type: types.ToolShell, Description: "step 1", Command: "echo", Args: []string{"one"}},
265
},
266
}
267
268
mockPlanner.EXPECT().
269
Plan(gomock.Any(), goal).
270
Return(plan, nil).
271
Times(1)
272
273
mockRunner.EXPECT().
274
RunStep(gomock.Any(), gomock.Any(), plan.Steps[0]).
275
DoAndReturn(func(_ context.Context, cfg types.Config, _ types.Step) (types.StepResult, error) {
276
Expect(cfg.WorkDir).To(Equal("/tmp/my-workdir"))
277
return types.StepResult{Step: plan.Steps[0], Outcome: types.OutcomeOK}, nil
278
}).
279
Times(1)
280
281
subject := planexec.NewPlanExecuteAgent(
282
mockClock,
283
mockPlanner,
284
mockRunner,
285
core.WithWorkDir("/tmp/my-workdir"),
286
)
287
288
_, err := subject.RunAgentGoal(context.Background(), goal)
289
Expect(err).NotTo(HaveOccurred())
290
})
291
292
it("WithDryRun should pass cfg.DryRun into the Runner", func() {
293
plan := types.Plan{
294
Goal: goal,
295
Steps: []types.Step{
296
{Type: types.ToolShell, Description: "step 1", Command: "echo", Args: []string{"one"}},
297
},
298
}
299
300
mockPlanner.EXPECT().
301
Plan(gomock.Any(), goal).
302
Return(plan, nil).
303
Times(1)
304
305
mockRunner.EXPECT().
306
RunStep(gomock.Any(), gomock.Any(), plan.Steps[0]).
307
DoAndReturn(func(_ context.Context, cfg types.Config, _ types.Step) (types.StepResult, error) {
308
Expect(cfg.DryRun).To(BeTrue())
309
return types.StepResult{Step: plan.Steps[0], Outcome: types.OutcomeDryRun}, nil
310
}).
311
Times(1)
312
313
subject := planexec.NewPlanExecuteAgent(
314
mockClock,
315
mockPlanner,
316
mockRunner,
317
core.WithDryRun(true),
318
)
319
320
_, err := subject.RunAgentGoal(context.Background(), goal)
321
Expect(err).NotTo(HaveOccurred())
322
})
323
324
it("happy path: should return the output of the final step", func() {
325
const goal = "do the thing"
326
327
plan := types.Plan{
328
Goal: goal,
329
Steps: []types.Step{
330
{Type: types.ToolLLM, Description: "step 1", Prompt: "first"},
331
{Type: types.ToolLLM, Description: "step 2", Prompt: "second"},
332
},
333
}
334
335
mockPlanner.EXPECT().
336
Plan(gomock.Any(), goal).
337
Return(plan, nil).
338
Times(1)
339
340
mockRunner.EXPECT().
341
RunStep(gomock.Any(), gomock.Any(), plan.Steps[0]).
342
Return(types.StepResult{
343
Step: plan.Steps[0],
344
Outcome: types.OutcomeOK,
345
Output: "A",
346
}, nil).
347
Times(1)
348
349
mockRunner.EXPECT().
350
RunStep(gomock.Any(), gomock.Any(), plan.Steps[1]).
351
Return(types.StepResult{
352
Step: plan.Steps[1],
353
Outcome: types.OutcomeOK,
354
Output: "B",
355
}, nil).
356
Times(1)
357
358
subject := planexec.NewPlanExecuteAgent(
359
mockClock,
360
mockPlanner,
361
mockRunner,
362
)
363
364
out, err := subject.RunAgentGoal(context.Background(), goal)
365
Expect(err).NotTo(HaveOccurred())
366
Expect(out).To(Equal("B")) // last step wins
367
})
368
369
it("should run all planned steps (PlanExecuteAgent no longer enforces MaxSteps)", func() {
370
plan := types.Plan{
371
Goal: goal,
372
Steps: []types.Step{
373
{Type: types.ToolShell, Description: "step 1", Command: "echo", Args: []string{"one"}},
374
{Type: types.ToolShell, Description: "step 2", Command: "echo", Args: []string{"two"}},
375
{Type: types.ToolShell, Description: "step 3", Command: "echo", Args: []string{"three"}},
376
},
377
}
378
379
mockPlanner.EXPECT().
380
Plan(gomock.Any(), goal).
381
Return(plan, nil).
382
Times(1)
383
384
gomock.InOrder(
385
mockRunner.EXPECT().
386
RunStep(gomock.Any(), gomock.Any(), plan.Steps[0]).
387
Return(types.StepResult{Step: plan.Steps[0], Outcome: types.OutcomeOK}, nil),
388
mockRunner.EXPECT().
389
RunStep(gomock.Any(), gomock.Any(), plan.Steps[1]).
390
Return(types.StepResult{Step: plan.Steps[1], Outcome: types.OutcomeOK}, nil),
391
mockRunner.EXPECT().
392
RunStep(gomock.Any(), gomock.Any(), plan.Steps[2]).
393
Return(types.StepResult{Step: plan.Steps[2], Outcome: types.OutcomeOK}, nil),
394
)
395
396
subject := planexec.NewPlanExecuteAgent(
397
mockClock,
398
mockPlanner,
399
mockRunner,
400
)
401
402
_, err := subject.RunAgentGoal(context.Background(), goal)
403
Expect(err).NotTo(HaveOccurred())
404
})
405
406
it("should accumulate results and render templates for later steps", func() {
407
plan := types.Plan{
408
Goal: goal,
409
Steps: []types.Step{
410
{Type: types.ToolLLM, Description: "step 1", Prompt: "first"},
411
{Type: types.ToolLLM, Description: "step 2", Prompt: "use {{ (index .Results 0).Output }}"},
412
},
413
}
414
415
mockPlanner.EXPECT().
416
Plan(gomock.Any(), goal).
417
Return(plan, nil).
418
Times(1)
419
420
// Step 1 runs normally.
421
mockRunner.EXPECT().
422
RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
423
DoAndReturn(func(_ context.Context, _ types.Config, s types.Step) (types.StepResult, error) {
424
Expect(s).To(Equal(plan.Steps[0])) // no template here
425
return types.StepResult{Step: s, Outcome: types.OutcomeOK, Output: "A"}, nil
426
}).
427
Times(1)
428
429
// Step 2 should arrive rendered.
430
mockRunner.EXPECT().
431
RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
432
DoAndReturn(func(_ context.Context, _ types.Config, s types.Step) (types.StepResult, error) {
433
Expect(s.Type).To(Equal(types.ToolLLM))
434
Expect(s.Prompt).To(Equal("use A"))
435
return types.StepResult{Step: s, Outcome: types.OutcomeOK, Output: "B"}, nil
436
}).
437
Times(1)
438
439
subject := planexec.NewPlanExecuteAgent(mockClock, mockPlanner, mockRunner)
440
_, err := subject.RunAgentGoal(context.Background(), goal)
441
Expect(err).NotTo(HaveOccurred())
442
})
443
444
it("should error if template rendering fails and not call Runner for that step", func() {
445
plan := types.Plan{
446
Goal: goal,
447
Steps: []types.Step{
448
{Type: types.ToolLLM, Description: "step 1", Prompt: "ok"},
449
{Type: types.ToolLLM, Description: "step 2", Prompt: "bad {{ .MissingKey }}"},
450
},
451
}
452
453
mockPlanner.EXPECT().
454
Plan(gomock.Any(), goal).
455
Return(plan, nil).
456
Times(1)
457
458
// Step 1 runs.
459
mockRunner.EXPECT().
460
RunStep(gomock.Any(), gomock.Any(), plan.Steps[0]).
461
Return(types.StepResult{Step: plan.Steps[0], Outcome: types.OutcomeOK, Output: "A"}, nil).
462
Times(1)
463
464
// Step 2 must NOT run (render should fail before Runner call).
465
mockRunner.EXPECT().
466
RunStep(gomock.Any(), gomock.Any(), plan.Steps[1]).
467
Times(0)
468
469
subject := planexec.NewPlanExecuteAgent(mockClock, mockPlanner, mockRunner)
470
471
_, err := subject.RunAgentGoal(context.Background(), goal)
472
Expect(err).To(HaveOccurred())
473
})
474
475
it("should bubble up policy violations (typed) as a stop reason", func() {
476
plan := types.Plan{
477
Goal: goal,
478
Steps: []types.Step{
479
{Type: types.ToolShell, Description: "step 1", Command: "echo", Args: []string{"one"}},
480
{Type: types.ToolShell, Description: "step 2", Command: "echo", Args: []string{"two"}},
481
},
482
}
483
484
mockPlanner.EXPECT().
485
Plan(gomock.Any(), goal).
486
Return(plan, nil).
487
Times(1)
488
489
polErr := core.PolicyDeniedError{
490
Kind: "workdir",
491
Reason: "workdir not allowed",
492
}
493
494
mockRunner.EXPECT().
495
RunStep(gomock.Any(), gomock.Any(), plan.Steps[0]).
496
Return(types.StepResult{}, polErr).
497
Times(1)
498
499
// Guard: step 2 must not run
500
mockRunner.EXPECT().
501
RunStep(gomock.Any(), gomock.Any(), plan.Steps[1]).
502
Times(0)
503
504
subject := planexec.NewPlanExecuteAgent(mockClock, mockPlanner, mockRunner)
505
506
_, err := subject.RunAgentGoal(context.Background(), goal)
507
Expect(err).To(MatchError(polErr))
508
})
509
510
it("should bubble up budget exceeded errors as a stop reason", func() {
511
plan := types.Plan{
512
Goal: goal,
513
Steps: []types.Step{
514
{Type: types.ToolShell, Description: "step 1", Command: "echo", Args: []string{"one"}},
515
{Type: types.ToolShell, Description: "step 2", Command: "echo", Args: []string{"two"}},
516
},
517
}
518
519
mockPlanner.EXPECT().
520
Plan(gomock.Any(), goal).
521
Return(plan, nil).
522
Times(1)
523
524
budgetErr := core.BudgetExceededError{
525
Kind: core.BudgetKindSteps,
526
Limit: 10,
527
Used: 10,
528
Message: "step budget exceeded",
529
}
530
531
mockRunner.EXPECT().
532
RunStep(gomock.Any(), gomock.Any(), plan.Steps[0]).
533
Return(types.StepResult{}, budgetErr).
534
Times(1)
535
536
// Guard: step 2 must not run
537
mockRunner.EXPECT().
538
RunStep(gomock.Any(), gomock.Any(), plan.Steps[1]).
539
Times(0)
540
541
subject := planexec.NewPlanExecuteAgent(mockClock, mockPlanner, mockRunner)
542
543
_, err := subject.RunAgentGoal(context.Background(), goal)
544
Expect(err).To(MatchError(budgetErr))
545
})
546
547
it("resets transcript and prompt-history at the start of each run", func() {
548
plan := types.Plan{
549
Goal: goal,
550
Steps: []types.Step{
551
{Type: types.ToolLLM, Description: "step 1", Prompt: "first"},
552
},
553
}
554
555
// Run #1
556
mockPlanner.EXPECT().
557
Plan(gomock.Any(), goal).
558
Return(plan, nil).
559
Times(1)
560
561
mockRunner.EXPECT().
562
RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
563
Return(types.StepResult{Step: plan.Steps[0], Outcome: types.OutcomeOK, Output: "A"}, nil).
564
Times(1)
565
566
subject := planexec.NewPlanExecuteAgent(mockClock, mockPlanner, mockRunner)
567
568
_, err := subject.RunAgentGoal(context.Background(), goal)
569
Expect(err).NotTo(HaveOccurred())
570
571
// Seed old leftovers that MUST be cleared on next run
572
subject.AddTranscript("OLD_TRANSCRIPT_SHOULD_BE_CLEARED")
573
if subject.PromptHistory != nil {
574
subject.PromptHistory.AppendString("OLD_PROMPT_SHOULD_BE_CLEARED")
575
}
576
577
// Run #2 (same goal is fine)
578
mockPlanner.EXPECT().
579
Plan(gomock.Any(), goal).
580
Return(plan, nil).
581
Times(1)
582
583
mockRunner.EXPECT().
584
RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
585
Return(types.StepResult{Step: plan.Steps[0], Outcome: types.OutcomeOK, Output: "B"}, nil).
586
Times(1)
587
588
_, err = subject.RunAgentGoal(context.Background(), goal)
589
Expect(err).NotTo(HaveOccurred())
590
591
// Assert old content is gone after reset
592
Expect(subject.TranscriptString()).NotTo(ContainSubstring("OLD_TRANSCRIPT_SHOULD_BE_CLEARED"))
593
594
// And prompt history (if enabled) should not contain the old string either
595
if subject.PromptHistory != nil {
596
Expect(subject.PromptHistory.String()).NotTo(ContainSubstring("OLD_PROMPT_SHOULD_BE_CLEARED"))
597
}
598
599
// Sanity: new run wrote fresh markers
600
Expect(subject.TranscriptString()).To(ContainSubstring("[goal]"))
601
Expect(subject.TranscriptString()).To(ContainSubstring(goal))
602
})
603
604
it("caps transcript growth (truncates) when steps produce large transcripts", func() {
605
plan := types.Plan{
606
Goal: goal,
607
Steps: []types.Step{
608
{Type: types.ToolShell, Description: "step 1", Command: "echo", Args: []string{"hi"}},
609
},
610
}
611
612
mockPlanner.EXPECT().
613
Plan(gomock.Any(), goal).
614
Return(plan, nil).
615
Times(1)
616
617
// Runner returns a huge transcript payload
618
big := strings.Repeat("X", 10_000)
619
620
mockRunner.EXPECT().
621
RunStep(gomock.Any(), gomock.Any(), gomock.Any()).
622
Return(types.StepResult{
623
Step: plan.Steps[0],
624
Outcome: types.OutcomeOK,
625
Output: "ok",
626
Transcript: big,
627
}, nil).
628
Times(1)
629
630
subject := planexec.NewPlanExecuteAgent(
631
mockClock,
632
mockPlanner,
633
mockRunner,
634
core.WithTranscriptMaxBytes(200),
635
)
636
637
_, err := subject.RunAgentGoal(context.Background(), goal)
638
Expect(err).NotTo(HaveOccurred())
639
640
ts := subject.TranscriptString()
641
642
// Hard check: buffer is capped
643
Expect(len([]byte(ts))).To(BeNumerically("<=", 200))
644
645
// And should include your truncation banner (whatever you used in transcript_buffer)
646
Expect(ts).To(ContainSubstring("…(truncated)"))
647
})
648
})
649
}
650
651
func expectAgentDuration(mockClock *MockClock, d time.Duration) {
652
t0 := time.Date(2026, 1, 13, 9, 0, 0, 0, time.UTC)
653
t1 := t0.Add(d)
654
655
// Robust: first call is t0, all subsequent calls are t1.
656
gomock.InOrder(
657
mockClock.EXPECT().Now().Return(t0).Times(1),
658
mockClock.EXPECT().Now().Return(t1).AnyTimes(),
659
)
660
}
661
662