Path: blob/master/src/packages/frontend/admin/llm/admin-llm-test.tsx
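// Admin-only panel for testing the configured LLM integrations: it runs a
// selected test prompt against every enabled model (core vendors, Ollama,
// custom OpenAI), streams each reply into a results table, and checks the
// reply against an expected regular expression.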
import {
  Alert,
  Button,
  Input,
  Progress,
  Select,
  Space,
  Table,
  Tooltip,
} from "antd";

import {
  redux,
  useAsyncEffect,
  useState,
  useTypedRedux,
} from "@cocalc/frontend/app-framework";
import { Icon, Loading, Paragraph, Title } from "@cocalc/frontend/components";
import { LLMModelName } from "@cocalc/frontend/components/llm-name";
import { Markdown } from "@cocalc/frontend/markdown";
import { webapp_client } from "@cocalc/frontend/webapp-client";
import {
  USER_SELECTABLE_LLMS_BY_VENDOR,
  isCoreLanguageModel,
  toCustomOpenAIModel,
  toOllamaModel,
} from "@cocalc/util/db-schema/llm-utils";
import { trunc_middle } from "@cocalc/util/misc";
import { COLORS } from "@cocalc/util/theme";
import { PROMPTS } from "./tests";
import { Value } from "./value";

interface TestResult {
  model: string;
  status: "pending" | "running" | "passed" | "failed";
  output: string;
  error?: string;
  firstResponseTime?: number; // Time in milliseconds until first token
  totalTime?: number; // Total time in milliseconds until completion
}

export function TestLLMAdmin() {
  const customize = redux.getStore("customize");
  const globallyEnabledLLMs = customize.getEnabledLLMs();
  const selectableLLMs = useTypedRedux("customize", "selectable_llms");
  const ollama = useTypedRedux("customize", "ollama");
  const custom_openai = useTypedRedux("customize", "custom_openai");
  const [test, setTest] = useState<number | null>(0);
  const [querying, setQuerying] = useState<boolean>(false);
  const [testResults, setTestResults] = useState<TestResult[]>([]);
  const [currentTestIndex, setCurrentTestIndex] = useState<number>(0);

  // Initialize test results on component mount or when test changes
  useAsyncEffect(() => {
    if (test !== null) {
      const allModels = getAllModels();
      const initialResults: TestResult[] = allModels.map((model) => ({
        model,
        status: "pending",
        output: "",
      }));
      setTestResults(initialResults);
    } else {
      setTestResults([]);
    }
  }, [test, custom_openai, ollama, selectableLLMs]);

  function getAllModels(): string[] {
    const models: string[] = [];

    // Get core models
    Object.entries(USER_SELECTABLE_LLMS_BY_VENDOR).forEach(([vendor, llms]) => {
      if (vendor !== "ollama" && vendor !== "custom_openai") {
        llms.filter(isCoreLanguageModel).forEach((llm) => {
          models.push(llm);
        });
      }
    });

    // Get custom OpenAI models
    Object.entries(custom_openai?.toJS() ?? {}).forEach(([key, _val]) => {
      const model = toCustomOpenAIModel(key);
      models.push(model);
    });

    // Get Ollama models
    Object.entries(ollama?.toJS() ?? {}).forEach(([key, _val]) => {
      const model = toOllamaModel(key);
      models.push(model);
    });

    return models;
  }
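  // Illustrative example (hypothetical configuration): with
  // selectable_llms = ["gpt-4o-mini"] and an Ollama entry keyed "llama3",
  // getAllModels() returns every core model plus the derived identifier from
  // toOllamaModel("llama3") (presumably "ollama-llama3"), while
  // getEnabledModels() below keeps a core model only if it appears in
  // selectable_llms; configured Ollama/custom OpenAI entries always pass.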
  function getEnabledModels(): string[] {
    return getAllModels().filter((model) => {
      // Check if model is enabled in selectable LLMs
      if (isCoreLanguageModel(model)) {
        return selectableLLMs.includes(model);
      }
      // Custom OpenAI and Ollama models are always considered enabled if configured
      return true;
    });
  }

  async function runTestForModel(
    model: string,
    testConfig: any,
  ): Promise<TestResult> {
    const { prompt, expected, system, history } = testConfig;
    const expectedRegex = new RegExp(expected, "g");

    return new Promise((resolve) => {
      try {
        const startTime = Date.now();
        let firstResponseTime: number | undefined;
        let totalTime: number | undefined;

        const llmStream = webapp_client.openai_client.queryStream({
          input: prompt,
          project_id: null,
          tag: "admin-llm-test",
          model,
          system,
          history,
          maxTokens: 20,
        });

        let reply = "";

        llmStream.on("token", (token) => {
          console.log({ model, system, token });
          if (token != null) {
            // Record first response time if this is the first token
            if (firstResponseTime === undefined) {
              firstResponseTime = Date.now() - startTime;
            }
            reply += token;
            // Update the result in real-time
            setTestResults((prev) =>
              prev.map((r) =>
                r.model === model ? { ...r, output: reply } : r,
              ),
            );
          } else {
            // Stream is complete (token is null)
            totalTime = Date.now() - startTime;
            const passed = expectedRegex.test(reply);
            resolve({
              model,
              status: passed ? "passed" : "failed",
              output: reply,
              firstResponseTime,
              totalTime,
            });
          }
        });

        llmStream.on("error", (err) => {
          totalTime = Date.now() - startTime;
          console.error(`Error in LLM stream for model ${model}:`, err);
          resolve({
            model,
            status: "failed",
            output: reply,
            error: err?.toString(),
            firstResponseTime,
            totalTime,
          });
        });

        // Start the stream
        llmStream.emit("start");
      } catch (err) {
        console.error(`Error running test for model ${model}:`, err);
        resolve({
          model,
          status: "failed",
          output: "",
          error: err?.toString(),
        });
      }
    });
  }

  async function runSingleTest(model: string) {
    if (test === null) return;

    const testConfig = PROMPTS[test];

    // Find the model in the results and update its status
    const modelIndex = testResults.findIndex((r) => r.model === model);
    if (modelIndex === -1) return;

    setCurrentTestIndex(modelIndex);

    // Update status to running
    setTestResults((prev) =>
      prev.map((r, idx) =>
        idx === modelIndex
          ? { ...r, status: "running", output: "", error: undefined }
          : r,
      ),
    );

    const result = await runTestForModel(model, testConfig);

    // Update with final result
    setTestResults((prev) =>
      prev.map((r, idx) => (idx === modelIndex ? result : r)),
    );
  }
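  // Design note on runTestForModel above: the wrapping promise always
  // resolves and never rejects (stream errors resolve to a "failed"
  // TestResult), so a single broken model cannot abort the sequential run
  // below. A null token on the "token" event is the stream's end-of-output
  // signal, at which point the accumulated reply is matched against the
  // expected regex.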
  async function runSequentialTests() {
    if (test === null) return;

    const models = getEnabledModels();
    const testConfig = PROMPTS[test];

    // Initialize results
    const initialResults: TestResult[] = models.map((model) => ({
      model,
      status: "pending",
      output: "",
    }));

    setTestResults(initialResults);
    setQuerying(true);
    setCurrentTestIndex(0);

    // Run tests sequentially
    for (let i = 0; i < models.length; i++) {
      setCurrentTestIndex(i);

      // Update status to running
      setTestResults((prev) =>
        prev.map((r, idx) => (idx === i ? { ...r, status: "running" } : r)),
      );

      const result = await runTestForModel(models[i], testConfig);

      // Update with final result
      setTestResults((prev) => prev.map((r, idx) => (idx === i ? result : r)));

      // Add delay between tests to avoid rate limiting
      if (i < models.length - 1) {
        await new Promise((resolve) => setTimeout(resolve, 100));
      }
    }

    setQuerying(false);
  }

  function renderTestResultIcon(status: TestResult["status"]) {
    switch (status) {
      case "pending":
        return <Icon unicode={0x2753} />;
      case "running":
        return <Loading text="" />;
      case "passed":
        return <Value val={true} />;
      case "failed":
        return <Value val={false} />;
      default:
        return <Icon unicode={0x2753} />;
    }
  }

  function formatTiming(timeMs: number | undefined): string {
    if (timeMs === undefined) return "-";
    return `${(timeMs / 1000).toFixed(1)}s`;
  }

  function renderTimingColumn(record: TestResult) {
    const { firstResponseTime, totalTime, status } = record;

    if (status === "pending" || status === "running") {
      return <span style={{ color: COLORS.GRAY_M }}>-</span>;
    }

    if (firstResponseTime === undefined || totalTime === undefined) {
      return <span style={{ color: COLORS.GRAY_M }}>-</span>;
    }

    // Calculate progress bar values (normalize to 10 seconds max)
    const maxTime = Math.max(
      10000,
      ...testResults.filter((r) => r.totalTime).map((r) => r.totalTime!),
    );
    const totalPercent = Math.min(100, (totalTime / maxTime) * 100);

    // Determine if this is one of the slowest (top 10% quantile)
    const completedResults = testResults.filter(
      (r) => r.totalTime !== undefined,
    );
    const sortedTimes = completedResults
      .map((r) => r.totalTime!)
      .sort((a, b) => b - a);
    const slowThreshold =
      sortedTimes[Math.floor(sortedTimes.length * 0.1)] || 0;
    const isSlow = totalTime >= slowThreshold && completedResults.length > 1;

    return (
      <div>
        <Tooltip title="First response time / Total completion time">
          <div style={{ marginBottom: 2 }}>
            {formatTiming(firstResponseTime)}/{formatTiming(totalTime)}
          </div>
        </Tooltip>
        <Progress
          percent={totalPercent}
          size="small"
          status={isSlow ? "exception" : "normal"}
          showInfo={false}
        />
      </div>
    );
  }
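  // Worked example for the slow-run flag above: with ten completed runs
  // sorted by totalTime descending, Math.floor(10 * 0.1) = 1 selects the
  // second-slowest time as slowThreshold, so the two slowest runs (those at
  // or above it) render their progress bar in the red "exception" state.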
  function renderTestResults() {
    if (testResults.length === 0) {
      return (
        <Paragraph>
          Click "Run Tests" to execute the selected test on all enabled models.
        </Paragraph>
      );
    }

    const columns = [
      {
        title: "Status",
        dataIndex: "status",
        key: "status",
        width: 80,
        render: (status: TestResult["status"]) => renderTestResultIcon(status),
      },
      {
        title: "Model",
        dataIndex: "model",
        key: "model",
        width: 180,
        render: (model: string /*, record: TestResult*/) => (
          <Space>
            <LLMModelName model={model} />
            {/* {record.status === "running" && <span>(Running...)</span>} */}
          </Space>
        ),
      },
      {
        title: "Output",
        dataIndex: "output",
        key: "output",
        render: (output: string) =>
          output ? (
            <Markdown value={output} />
          ) : (
            <span style={{ color: COLORS.GRAY_M }}>-</span>
          ),
      },
      {
        title: "Error",
        dataIndex: "error",
        key: "error",
        render: (error: string) =>
          error ? (
            <Alert type="error" banner message={error} style={{ margin: 0 }} />
          ) : (
            <span style={{ color: COLORS.GRAY_M }}>-</span>
          ),
      },
      {
        title: "Timing",
        key: "timing",
        width: 120,
        render: (_, record: TestResult) => renderTimingColumn(record),
      },
      {
        title: "Test",
        key: "test",
        width: 80,
        render: (_, record: TestResult) => {
          const isEnabled = getEnabledModels().includes(record.model);
          const isRunning = record.status === "running";
          const isQuerying = querying && record.status === "running";

          return (
            <Button
              type="primary"
              size="small"
              disabled={test === null || !isEnabled || isQuerying}
              loading={isRunning}
              onClick={() => runSingleTest(record.model)}
              style={{ width: "60px" }}
            >
              {isRunning ? "" : "Run"}
            </Button>
          );
        },
      },
    ];

    const dataSource = testResults.map((result, index) => ({
      ...result,
      key: result.model,
      // Add row styling for currently running test
      className:
        index === currentTestIndex && querying ? "running-row" : undefined,
    }));

    return (
      <div>
        <Title level={4}>Test Results</Title>
        <Table
          columns={columns}
          dataSource={dataSource}
          pagination={false}
          size="small"
          rowClassName={(_, index) =>
            index === currentTestIndex && querying
              ? "admin-llm-test-running-row"
              : ""
          }
          style={{ marginTop: "10px" }}
        />
      </div>
    );
  }

  return (
    <div>
      <Paragraph>
        Globally enabled LLMs (Admin Settings):
        <Value val={globallyEnabledLLMs} />.
      </Paragraph>
      <Paragraph>
        <Space>
          <Input
            value={test != null ? PROMPTS[test].prompt : ""}
            disabled={true || querying}
            onChange={(e) => setTest(parseInt(e.target.value))}
            placeholder="Enter a query..."
            addonAfter={
              <Select
                onSelect={setTest}
                defaultValue={0}
                popupMatchSelectWidth={false}
              >
                {PROMPTS.map((p, i) => (
                  <Select.Option key={i} value={i}>
                    {trunc_middle(p.prompt, 25)}
                  </Select.Option>
                ))}
              </Select>
            }
          />
          <Button
            type="primary"
            onClick={runSequentialTests}
            disabled={test == null || querying}
          >
            Run Tests
          </Button>
          <Button
            onClick={() => {
              setTest(null);
              setTestResults([]);
            }}
          >
            Clear
          </Button>
        </Space>
      </Paragraph>

      {renderTestResults()}

      <Title level={5}>Ollama configuration</Title>
      <Value val={ollama} />
      <Title level={5}>Custom OpenAI API</Title>
      <Value val={custom_openai} />
    </div>
  );
}
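// Assumed shape of the PROMPTS entries in ./tests, inferred from the
// destructuring in runTestForModel (not verified against that file):
//   { prompt: string; expected: string; system?: string; history?: unknown }
// where `expected` is a regular-expression source string used to decide
// pass/fail.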