Commit
•
2195fa8
1
Parent(s):
93d4ebf
Downloading instead of hardcoding llmperf
Browse files- llmperf/.gitignore +0 -247
- llmperf/LICENSE.txt +0 -202
- llmperf/NOTICE.txt +0 -14
- llmperf/README.md +0 -415
- llmperf/analyze-token-benchmark-results.ipynb +0 -327
- llmperf/llm_correctness.py +0 -309
- llmperf/pre-commit.sh +0 -5
- llmperf/pyproject.toml +0 -23
- llmperf/requirements-dev.txt +0 -2
- llmperf/src/llmperf/__init__.py +0 -1
- llmperf/src/llmperf/common.py +0 -38
- llmperf/src/llmperf/common_metrics.py +0 -17
- llmperf/src/llmperf/models.py +0 -21
- llmperf/src/llmperf/ray_clients/__init__.py +0 -0
- llmperf/src/llmperf/ray_clients/litellm_client.py +0 -100
- llmperf/src/llmperf/ray_clients/openai_chat_completions_client.py +0 -120
- llmperf/src/llmperf/ray_clients/sagemaker_client.py +0 -158
- llmperf/src/llmperf/ray_clients/vertexai_client.py +0 -135
- llmperf/src/llmperf/ray_llm_client.py +0 -22
- llmperf/src/llmperf/requests_launcher.py +0 -48
- llmperf/src/llmperf/sonnet.txt +0 -84
- llmperf/src/llmperf/utils.py +0 -147
- llmperf/token_benchmark_ray.py +0 -469
- on_startup.sh +6 -0
- requirements.txt +7 -6
llmperf/.gitignore
DELETED
@@ -1,247 +0,0 @@
|
|
1 |
-
# The build output should clearly not be checked in
|
2 |
-
*test-output.xml
|
3 |
-
/bazel-*
|
4 |
-
/python/ray/core
|
5 |
-
/python/ray/pickle5_files/
|
6 |
-
/python/ray/thirdparty_files/
|
7 |
-
/python/ray/pyarrow_files/
|
8 |
-
/python/ray/jars/
|
9 |
-
/python/ray/cpp/
|
10 |
-
/python/build
|
11 |
-
/python/dist
|
12 |
-
/python/python-driver-*
|
13 |
-
/python/ray/serve/generated
|
14 |
-
/thirdparty/pkg/
|
15 |
-
/build/java
|
16 |
-
.jar
|
17 |
-
/dashboard/client/build
|
18 |
-
|
19 |
-
# Files generated by flatc should be ignored
|
20 |
-
/src/ray/gcs/format/*_generated.h
|
21 |
-
/src/ray/object_manager/format/*_generated.h
|
22 |
-
/src/ray/raylet/format/*_generated.h
|
23 |
-
/java/runtime/src/main/java/io/ray/runtime/generated/*
|
24 |
-
/java/serve/src/main/java/io/ray/serve/generated/*
|
25 |
-
|
26 |
-
# Files generated by c++ worker should be ignored.
|
27 |
-
/cpp/example/thirdparty/
|
28 |
-
/cpp/example/bazel-*
|
29 |
-
/python/ray/cpp
|
30 |
-
|
31 |
-
# Redis temporary files
|
32 |
-
*dump.rdb
|
33 |
-
|
34 |
-
# Python byte code files
|
35 |
-
*.pyc
|
36 |
-
python/.eggs
|
37 |
-
*.egg-info
|
38 |
-
|
39 |
-
# Backup files
|
40 |
-
*.bak
|
41 |
-
|
42 |
-
# Emacs temporary files
|
43 |
-
*~
|
44 |
-
*#
|
45 |
-
|
46 |
-
# Compiled Object files
|
47 |
-
*.slo
|
48 |
-
*.lo
|
49 |
-
*.o
|
50 |
-
*.xo
|
51 |
-
*.obj
|
52 |
-
|
53 |
-
# Precompiled Headers
|
54 |
-
*.gch
|
55 |
-
*.pch
|
56 |
-
|
57 |
-
# Compiled Dynamic libraries
|
58 |
-
*.so
|
59 |
-
*.dylib
|
60 |
-
*.dll
|
61 |
-
python/ray/_raylet.pyd
|
62 |
-
|
63 |
-
# Incremental linking files
|
64 |
-
*.ilk
|
65 |
-
|
66 |
-
# Library export files
|
67 |
-
*.exp
|
68 |
-
|
69 |
-
# Debug symbols
|
70 |
-
*.pdb
|
71 |
-
|
72 |
-
# Fortran module files
|
73 |
-
*.mod
|
74 |
-
!deploy/ray-operator/go.mod
|
75 |
-
|
76 |
-
# Compiled Static libraries
|
77 |
-
*.lai
|
78 |
-
*.la
|
79 |
-
*.a
|
80 |
-
*.lib
|
81 |
-
|
82 |
-
# Executables
|
83 |
-
*.exe
|
84 |
-
*.out
|
85 |
-
*.app
|
86 |
-
|
87 |
-
# Visual Studio files
|
88 |
-
/packages
|
89 |
-
*.suo
|
90 |
-
*.user
|
91 |
-
*.VC.db
|
92 |
-
*.VC.opendb
|
93 |
-
|
94 |
-
# Protobuf-generated files
|
95 |
-
*_pb2.py
|
96 |
-
*.pb.h
|
97 |
-
*.pb.cc
|
98 |
-
|
99 |
-
# Ray cluster configuration
|
100 |
-
scripts/nodes.txt
|
101 |
-
|
102 |
-
# OS X folder attributes
|
103 |
-
.DS_Store
|
104 |
-
|
105 |
-
# Debug files
|
106 |
-
*.dSYM/
|
107 |
-
*.su
|
108 |
-
|
109 |
-
# Python setup files
|
110 |
-
*.egg-info
|
111 |
-
|
112 |
-
# Compressed files
|
113 |
-
*.gz
|
114 |
-
|
115 |
-
# Datasets from examples
|
116 |
-
**/MNIST_data/
|
117 |
-
**/cifar-10-batches-bin/
|
118 |
-
|
119 |
-
# Generated documentation files
|
120 |
-
/doc/_build
|
121 |
-
/doc/source/_static/thumbs
|
122 |
-
/doc/source/tune/generated_guides/
|
123 |
-
/doc/source/**/doc/
|
124 |
-
|
125 |
-
# User-specific stuff:
|
126 |
-
.idea/**/workspace.xml
|
127 |
-
.idea/**/tasks.xml
|
128 |
-
.idea/dictionaries
|
129 |
-
.llvm-local.bazelrc
|
130 |
-
|
131 |
-
# Sensitive or high-churn files:
|
132 |
-
.idea/**/dataSources/
|
133 |
-
.idea/**/dataSources.ids
|
134 |
-
.idea/**/dataSources.xml
|
135 |
-
.idea/**/dataSources.local.xml
|
136 |
-
.idea/**/sqlDataSources.xml
|
137 |
-
.idea/**/dynamic.xml
|
138 |
-
.idea/**/uiDesigner.xml
|
139 |
-
|
140 |
-
# Gradle:
|
141 |
-
.idea/**/gradle.xml
|
142 |
-
.idea/**/libraries
|
143 |
-
.idea
|
144 |
-
|
145 |
-
# Website
|
146 |
-
/site/Gemfile.lock
|
147 |
-
/site/.sass-cache
|
148 |
-
/site/_site
|
149 |
-
|
150 |
-
# Pytest Cache
|
151 |
-
**/.pytest_cache
|
152 |
-
**/.cache
|
153 |
-
.benchmarks
|
154 |
-
python-driver-*
|
155 |
-
|
156 |
-
# Vscode
|
157 |
-
.vscode/
|
158 |
-
|
159 |
-
*.iml
|
160 |
-
|
161 |
-
# Java
|
162 |
-
java/**/target
|
163 |
-
java/**/lib
|
164 |
-
java/**/.settings
|
165 |
-
java/**/.classpath
|
166 |
-
java/**/.project
|
167 |
-
java/runtime/native_dependencies/
|
168 |
-
java/testng_custom.xml
|
169 |
-
|
170 |
-
dependency-reduced-pom.xml
|
171 |
-
|
172 |
-
# Cpp
|
173 |
-
cpp/example/thirdparty/
|
174 |
-
|
175 |
-
.clwb
|
176 |
-
|
177 |
-
# pom.xml files generated from pom_template.xml
|
178 |
-
java/**/pom.xml
|
179 |
-
|
180 |
-
# python virtual env
|
181 |
-
venv
|
182 |
-
|
183 |
-
# pyenv version file
|
184 |
-
.python-version
|
185 |
-
|
186 |
-
# Vim
|
187 |
-
.*.swp
|
188 |
-
*.swp
|
189 |
-
.*.swo
|
190 |
-
*.swo
|
191 |
-
tags
|
192 |
-
tags.lock
|
193 |
-
tags.temp
|
194 |
-
*.vim
|
195 |
-
|
196 |
-
# Emacs
|
197 |
-
.#*
|
198 |
-
|
199 |
-
# tools
|
200 |
-
tools/prometheus*
|
201 |
-
|
202 |
-
# ray project files
|
203 |
-
project-id
|
204 |
-
.mypy_cache/
|
205 |
-
|
206 |
-
# release test related
|
207 |
-
.anyscale.yaml
|
208 |
-
test_state.json
|
209 |
-
|
210 |
-
# workflow storage
|
211 |
-
workflow_data/
|
212 |
-
|
213 |
-
# vscode java extension generated
|
214 |
-
.factorypath
|
215 |
-
|
216 |
-
# Jupyter Notebooks
|
217 |
-
**/.ipynb_checkpoints/
|
218 |
-
|
219 |
-
### Added by Hedron's Bazel Compile Commands Extractor: https://github.com/hedronvision/bazel-compile-commands-extractor
|
220 |
-
# The external link: Differs on Windows vs macOS/Linux, so we can't check it in. The pattern needs to not have a trailing / because it's a symlink on macOS/Linux.
|
221 |
-
/external
|
222 |
-
# Compiled output -> don't check in
|
223 |
-
/compile_commands.json
|
224 |
-
# Directory where clangd puts its indexing work
|
225 |
-
/.cache/
|
226 |
-
|
227 |
-
# Auto-generated tag mapping
|
228 |
-
tag-mapping.json
|
229 |
-
|
230 |
-
.bazeliskrc
|
231 |
-
|
232 |
-
# ignore tmp files
|
233 |
-
*.tmp
|
234 |
-
out
|
235 |
-
temp*
|
236 |
-
|
237 |
-
# build output
|
238 |
-
build/
|
239 |
-
dist/
|
240 |
-
|
241 |
-
# results
|
242 |
-
output/
|
243 |
-
*.json
|
244 |
-
result_outputs/
|
245 |
-
|
246 |
-
__pycache__
|
247 |
-
**/__pycache__/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
llmperf/LICENSE.txt
DELETED
@@ -1,202 +0,0 @@
|
|
1 |
-
|
2 |
-
Apache License
|
3 |
-
Version 2.0, January 2004
|
4 |
-
http://www.apache.org/licenses/
|
5 |
-
|
6 |
-
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
7 |
-
|
8 |
-
1. Definitions.
|
9 |
-
|
10 |
-
"License" shall mean the terms and conditions for use, reproduction,
|
11 |
-
and distribution as defined by Sections 1 through 9 of this document.
|
12 |
-
|
13 |
-
"Licensor" shall mean the copyright owner or entity authorized by
|
14 |
-
the copyright owner that is granting the License.
|
15 |
-
|
16 |
-
"Legal Entity" shall mean the union of the acting entity and all
|
17 |
-
other entities that control, are controlled by, or are under common
|
18 |
-
control with that entity. For the purposes of this definition,
|
19 |
-
"control" means (i) the power, direct or indirect, to cause the
|
20 |
-
direction or management of such entity, whether by contract or
|
21 |
-
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
22 |
-
outstanding shares, or (iii) beneficial ownership of such entity.
|
23 |
-
|
24 |
-
"You" (or "Your") shall mean an individual or Legal Entity
|
25 |
-
exercising permissions granted by this License.
|
26 |
-
|
27 |
-
"Source" form shall mean the preferred form for making modifications,
|
28 |
-
including but not limited to software source code, documentation
|
29 |
-
source, and configuration files.
|
30 |
-
|
31 |
-
"Object" form shall mean any form resulting from mechanical
|
32 |
-
transformation or translation of a Source form, including but
|
33 |
-
not limited to compiled object code, generated documentation,
|
34 |
-
and conversions to other media types.
|
35 |
-
|
36 |
-
"Work" shall mean the work of authorship, whether in Source or
|
37 |
-
Object form, made available under the License, as indicated by a
|
38 |
-
copyright notice that is included in or attached to the work
|
39 |
-
(an example is provided in the Appendix below).
|
40 |
-
|
41 |
-
"Derivative Works" shall mean any work, whether in Source or Object
|
42 |
-
form, that is based on (or derived from) the Work and for which the
|
43 |
-
editorial revisions, annotations, elaborations, or other modifications
|
44 |
-
represent, as a whole, an original work of authorship. For the purposes
|
45 |
-
of this License, Derivative Works shall not include works that remain
|
46 |
-
separable from, or merely link (or bind by name) to the interfaces of,
|
47 |
-
the Work and Derivative Works thereof.
|
48 |
-
|
49 |
-
"Contribution" shall mean any work of authorship, including
|
50 |
-
the original version of the Work and any modifications or additions
|
51 |
-
to that Work or Derivative Works thereof, that is intentionally
|
52 |
-
submitted to Licensor for inclusion in the Work by the copyright owner
|
53 |
-
or by an individual or Legal Entity authorized to submit on behalf of
|
54 |
-
the copyright owner. For the purposes of this definition, "submitted"
|
55 |
-
means any form of electronic, verbal, or written communication sent
|
56 |
-
to the Licensor or its representatives, including but not limited to
|
57 |
-
communication on electronic mailing lists, source code control systems,
|
58 |
-
and issue tracking systems that are managed by, or on behalf of, the
|
59 |
-
Licensor for the purpose of discussing and improving the Work, but
|
60 |
-
excluding communication that is conspicuously marked or otherwise
|
61 |
-
designated in writing by the copyright owner as "Not a Contribution."
|
62 |
-
|
63 |
-
"Contributor" shall mean Licensor and any individual or Legal Entity
|
64 |
-
on behalf of whom a Contribution has been received by Licensor and
|
65 |
-
subsequently incorporated within the Work.
|
66 |
-
|
67 |
-
2. Grant of Copyright License. Subject to the terms and conditions of
|
68 |
-
this License, each Contributor hereby grants to You a perpetual,
|
69 |
-
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
70 |
-
copyright license to reproduce, prepare Derivative Works of,
|
71 |
-
publicly display, publicly perform, sublicense, and distribute the
|
72 |
-
Work and such Derivative Works in Source or Object form.
|
73 |
-
|
74 |
-
3. Grant of Patent License. Subject to the terms and conditions of
|
75 |
-
this License, each Contributor hereby grants to You a perpetual,
|
76 |
-
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
77 |
-
(except as stated in this section) patent license to make, have made,
|
78 |
-
use, offer to sell, sell, import, and otherwise transfer the Work,
|
79 |
-
where such license applies only to those patent claims licensable
|
80 |
-
by such Contributor that are necessarily infringed by their
|
81 |
-
Contribution(s) alone or by combination of their Contribution(s)
|
82 |
-
with the Work to which such Contribution(s) was submitted. If You
|
83 |
-
institute patent litigation against any entity (including a
|
84 |
-
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
85 |
-
or a Contribution incorporated within the Work constitutes direct
|
86 |
-
or contributory patent infringement, then any patent licenses
|
87 |
-
granted to You under this License for that Work shall terminate
|
88 |
-
as of the date such litigation is filed.
|
89 |
-
|
90 |
-
4. Redistribution. You may reproduce and distribute copies of the
|
91 |
-
Work or Derivative Works thereof in any medium, with or without
|
92 |
-
modifications, and in Source or Object form, provided that You
|
93 |
-
meet the following conditions:
|
94 |
-
|
95 |
-
(a) You must give any other recipients of the Work or
|
96 |
-
Derivative Works a copy of this License; and
|
97 |
-
|
98 |
-
(b) You must cause any modified files to carry prominent notices
|
99 |
-
stating that You changed the files; and
|
100 |
-
|
101 |
-
(c) You must retain, in the Source form of any Derivative Works
|
102 |
-
that You distribute, all copyright, patent, trademark, and
|
103 |
-
attribution notices from the Source form of the Work,
|
104 |
-
excluding those notices that do not pertain to any part of
|
105 |
-
the Derivative Works; and
|
106 |
-
|
107 |
-
(d) If the Work includes a "NOTICE" text file as part of its
|
108 |
-
distribution, then any Derivative Works that You distribute must
|
109 |
-
include a readable copy of the attribution notices contained
|
110 |
-
within such NOTICE file, excluding those notices that do not
|
111 |
-
pertain to any part of the Derivative Works, in at least one
|
112 |
-
of the following places: within a NOTICE text file distributed
|
113 |
-
as part of the Derivative Works; within the Source form or
|
114 |
-
documentation, if provided along with the Derivative Works; or,
|
115 |
-
within a display generated by the Derivative Works, if and
|
116 |
-
wherever such third-party notices normally appear. The contents
|
117 |
-
of the NOTICE file are for informational purposes only and
|
118 |
-
do not modify the License. You may add Your own attribution
|
119 |
-
notices within Derivative Works that You distribute, alongside
|
120 |
-
or as an addendum to the NOTICE text from the Work, provided
|
121 |
-
that such additional attribution notices cannot be construed
|
122 |
-
as modifying the License.
|
123 |
-
|
124 |
-
You may add Your own copyright statement to Your modifications and
|
125 |
-
may provide additional or different license terms and conditions
|
126 |
-
for use, reproduction, or distribution of Your modifications, or
|
127 |
-
for any such Derivative Works as a whole, provided Your use,
|
128 |
-
reproduction, and distribution of the Work otherwise complies with
|
129 |
-
the conditions stated in this License.
|
130 |
-
|
131 |
-
5. Submission of Contributions. Unless You explicitly state otherwise,
|
132 |
-
any Contribution intentionally submitted for inclusion in the Work
|
133 |
-
by You to the Licensor shall be under the terms and conditions of
|
134 |
-
this License, without any additional terms or conditions.
|
135 |
-
Notwithstanding the above, nothing herein shall supersede or modify
|
136 |
-
the terms of any separate license agreement you may have executed
|
137 |
-
with Licensor regarding such Contributions.
|
138 |
-
|
139 |
-
6. Trademarks. This License does not grant permission to use the trade
|
140 |
-
names, trademarks, service marks, or product names of the Licensor,
|
141 |
-
except as required for reasonable and customary use in describing the
|
142 |
-
origin of the Work and reproducing the content of the NOTICE file.
|
143 |
-
|
144 |
-
7. Disclaimer of Warranty. Unless required by applicable law or
|
145 |
-
agreed to in writing, Licensor provides the Work (and each
|
146 |
-
Contributor provides its Contributions) on an "AS IS" BASIS,
|
147 |
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
148 |
-
implied, including, without limitation, any warranties or conditions
|
149 |
-
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
150 |
-
PARTICULAR PURPOSE. You are solely responsible for determining the
|
151 |
-
appropriateness of using or redistributing the Work and assume any
|
152 |
-
risks associated with Your exercise of permissions under this License.
|
153 |
-
|
154 |
-
8. Limitation of Liability. In no event and under no legal theory,
|
155 |
-
whether in tort (including negligence), contract, or otherwise,
|
156 |
-
unless required by applicable law (such as deliberate and grossly
|
157 |
-
negligent acts) or agreed to in writing, shall any Contributor be
|
158 |
-
liable to You for damages, including any direct, indirect, special,
|
159 |
-
incidental, or consequential damages of any character arising as a
|
160 |
-
result of this License or out of the use or inability to use the
|
161 |
-
Work (including but not limited to damages for loss of goodwill,
|
162 |
-
work stoppage, computer failure or malfunction, or any and all
|
163 |
-
other commercial damages or losses), even if such Contributor
|
164 |
-
has been advised of the possibility of such damages.
|
165 |
-
|
166 |
-
9. Accepting Warranty or Additional Liability. While redistributing
|
167 |
-
the Work or Derivative Works thereof, You may choose to offer,
|
168 |
-
and charge a fee for, acceptance of support, warranty, indemnity,
|
169 |
-
or other liability obligations and/or rights consistent with this
|
170 |
-
License. However, in accepting such obligations, You may act only
|
171 |
-
on Your own behalf and on Your sole responsibility, not on behalf
|
172 |
-
of any other Contributor, and only if You agree to indemnify,
|
173 |
-
defend, and hold each Contributor harmless for any liability
|
174 |
-
incurred by, or claims asserted against, such Contributor by reason
|
175 |
-
of your accepting any such warranty or additional liability.
|
176 |
-
|
177 |
-
END OF TERMS AND CONDITIONS
|
178 |
-
|
179 |
-
APPENDIX: How to apply the Apache License to your work.
|
180 |
-
|
181 |
-
To apply the Apache License to your work, attach the following
|
182 |
-
boilerplate notice, with the fields enclosed by brackets "[]"
|
183 |
-
replaced with your own identifying information. (Don't include
|
184 |
-
the brackets!) The text should be enclosed in the appropriate
|
185 |
-
comment syntax for the file format. We also recommend that a
|
186 |
-
file or class name and description of purpose be included on the
|
187 |
-
same "printed page" as the copyright notice for easier
|
188 |
-
identification within third-party archives.
|
189 |
-
|
190 |
-
Copyright [yyyy] [name of copyright owner]
|
191 |
-
|
192 |
-
Licensed under the Apache License, Version 2.0 (the "License");
|
193 |
-
you may not use this file except in compliance with the License.
|
194 |
-
You may obtain a copy of the License at
|
195 |
-
|
196 |
-
http://www.apache.org/licenses/LICENSE-2.0
|
197 |
-
|
198 |
-
Unless required by applicable law or agreed to in writing, software
|
199 |
-
distributed under the License is distributed on an "AS IS" BASIS,
|
200 |
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
201 |
-
See the License for the specific language governing permissions and
|
202 |
-
limitations under the License.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
llmperf/NOTICE.txt
DELETED
@@ -1,14 +0,0 @@
|
|
1 |
-
[Project Name]
|
2 |
-
Copyright 2023-onwards Anyscale, Inc.
|
3 |
-
|
4 |
-
Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
-
you may not use this file except in compliance with the License.
|
6 |
-
You may obtain a copy of the License at
|
7 |
-
|
8 |
-
http://www.apache.org/licenses/LICENSE-2.0
|
9 |
-
|
10 |
-
Unless required by applicable law or agreed to in writing, software
|
11 |
-
distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
-
See the License for the specific language governing permissions and
|
14 |
-
limitations under the License.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
llmperf/README.md
DELETED
@@ -1,415 +0,0 @@
|
|
1 |
-
# LLMPerf
|
2 |
-
|
3 |
-
A tool for evaluating the performance of LLM APIs.
|
4 |
-
|
5 |
-
# Installation
|
6 |
-
```bash
|
7 |
-
git clone https://github.com/ray-project/llmperf.git
|
8 |
-
cd llmperf
|
9 |
-
pip install -e .
|
10 |
-
```
|
11 |
-
|
12 |
-
# Basic Usage
|
13 |
-
|
14 |
-
We implement 2 tests for evaluating LLMs: a load test to check for performance and a correctness test to check for correctness.
|
15 |
-
|
16 |
-
## Load test
|
17 |
-
|
18 |
-
The load test spawns a number of concurrent requests to the LLM API and measures the inter-token latency and generation throughput per request and across concurrent requests. The prompt that is sent with each request is of the format:
|
19 |
-
|
20 |
-
```
|
21 |
-
Randomly stream lines from the following text. Don't generate eos tokens:
|
22 |
-
LINE 1,
|
23 |
-
LINE 2,
|
24 |
-
LINE 3,
|
25 |
-
...
|
26 |
-
```
|
27 |
-
|
28 |
-
Where the lines are randomly sampled from a collection of lines from Shakespeare sonnets. Tokens are counted using the `LlamaTokenizer` regardless of which LLM API is being tested. This is to ensure that the prompts are consistent across different LLM APIs.
|
29 |
-
|
30 |
-
To run the most basic load test you can use the token_benchmark_ray script.
|
31 |
-
|
32 |
-
|
33 |
-
### Caveats and Disclaimers
|
34 |
-
|
35 |
-
- The endpoints provider backend might vary widely, so this is not a reflection on how the software runs on a particular hardware.
|
36 |
-
- The results may vary with time of day.
|
37 |
-
- The results may vary with the load.
|
38 |
-
- The results may not correlate with users’ workloads.
|
39 |
-
|
40 |
-
### OpenAI Compatible APIs
|
41 |
-
```bash
|
42 |
-
export OPENAI_API_KEY=secret_abcdefg
|
43 |
-
export OPENAI_API_BASE="https://api.endpoints.anyscale.com/v1"
|
44 |
-
|
45 |
-
python token_benchmark_ray.py \
|
46 |
-
--model "meta-llama/Llama-2-7b-chat-hf" \
|
47 |
-
--mean-input-tokens 550 \
|
48 |
-
--stddev-input-tokens 150 \
|
49 |
-
--mean-output-tokens 150 \
|
50 |
-
--stddev-output-tokens 10 \
|
51 |
-
--max-num-completed-requests 2 \
|
52 |
-
--timeout 600 \
|
53 |
-
--num-concurrent-requests 1 \
|
54 |
-
--results-dir "result_outputs" \
|
55 |
-
--llm-api openai \
|
56 |
-
--additional-sampling-params '{}'
|
57 |
-
|
58 |
-
```
|
59 |
-
|
60 |
-
### Anthropic
|
61 |
-
```bash
|
62 |
-
export ANTHROPIC_API_KEY=secret_abcdefg
|
63 |
-
|
64 |
-
python token_benchmark_ray.py \
|
65 |
-
--model "claude-2" \
|
66 |
-
--mean-input-tokens 550 \
|
67 |
-
--stddev-input-tokens 150 \
|
68 |
-
--mean-output-tokens 150 \
|
69 |
-
--stddev-output-tokens 10 \
|
70 |
-
--max-num-completed-requests 2 \
|
71 |
-
--timeout 600 \
|
72 |
-
--num-concurrent-requests 1 \
|
73 |
-
--results-dir "result_outputs" \
|
74 |
-
--llm-api anthropic \
|
75 |
-
--additional-sampling-params '{}'
|
76 |
-
|
77 |
-
```
|
78 |
-
|
79 |
-
### TogetherAI
|
80 |
-
|
81 |
-
```bash
|
82 |
-
export TOGETHERAI_API_KEY="YOUR_TOGETHER_KEY"
|
83 |
-
|
84 |
-
python token_benchmark_ray.py \
|
85 |
-
--model "together_ai/togethercomputer/CodeLlama-7b-Instruct" \
|
86 |
-
--mean-input-tokens 550 \
|
87 |
-
--stddev-input-tokens 150 \
|
88 |
-
--mean-output-tokens 150 \
|
89 |
-
--stddev-output-tokens 10 \
|
90 |
-
--max-num-completed-requests 2 \
|
91 |
-
--timeout 600 \
|
92 |
-
--num-concurrent-requests 1 \
|
93 |
-
--results-dir "result_outputs" \
|
94 |
-
--llm-api "litellm" \
|
95 |
-
--additional-sampling-params '{}'
|
96 |
-
|
97 |
-
```
|
98 |
-
|
99 |
-
### Hugging Face
|
100 |
-
|
101 |
-
```bash
|
102 |
-
export HUGGINGFACE_API_KEY="YOUR_HUGGINGFACE_API_KEY"
|
103 |
-
export HUGGINGFACE_API_BASE="YOUR_HUGGINGFACE_API_ENDPOINT"
|
104 |
-
|
105 |
-
python token_benchmark_ray.py \
|
106 |
-
--model "huggingface/meta-llama/Llama-2-7b-chat-hf" \
|
107 |
-
--mean-input-tokens 550 \
|
108 |
-
--stddev-input-tokens 150 \
|
109 |
-
--mean-output-tokens 150 \
|
110 |
-
--stddev-output-tokens 10 \
|
111 |
-
--max-num-completed-requests 2 \
|
112 |
-
--timeout 600 \
|
113 |
-
--num-concurrent-requests 1 \
|
114 |
-
--results-dir "result_outputs" \
|
115 |
-
--llm-api "litellm" \
|
116 |
-
--additional-sampling-params '{}'
|
117 |
-
|
118 |
-
```
|
119 |
-
|
120 |
-
### LiteLLM
|
121 |
-
|
122 |
-
LLMPerf can use LiteLLM to send prompts to LLM APIs. The environment variables to set for the provider, and the arguments to set for model and additional-sampling-params, are described in the provider documentation linked below.
|
123 |
-
|
124 |
-
see the [LiteLLM Provider Documentation](https://docs.litellm.ai/docs/providers).
|
125 |
-
|
126 |
-
```bash
|
127 |
-
python token_benchmark_ray.py \
|
128 |
-
--model "meta-llama/Llama-2-7b-chat-hf" \
|
129 |
-
--mean-input-tokens 550 \
|
130 |
-
--stddev-input-tokens 150 \
|
131 |
-
--mean-output-tokens 150 \
|
132 |
-
--stddev-output-tokens 10 \
|
133 |
-
--max-num-completed-requests 2 \
|
134 |
-
--timeout 600 \
|
135 |
-
--num-concurrent-requests 1 \
|
136 |
-
--results-dir "result_outputs" \
|
137 |
-
--llm-api "litellm" \
|
138 |
-
--additional-sampling-params '{}'
|
139 |
-
|
140 |
-
```
|
141 |
-
|
142 |
-
### Vertex AI
|
143 |
-
|
144 |
-
Here, --model is used for logging, not for selecting the model. The model is specified in the Vertex AI Endpoint ID.
|
145 |
-
|
146 |
-
The GCLOUD_ACCESS_TOKEN needs to be somewhat regularly set, as the token generated by `gcloud auth print-access-token` expires after 15 minutes or so.
|
147 |
-
|
148 |
-
Vertex AI doesn't return the total number of tokens that are generated by their endpoint, so tokens are counted using the LLama tokenizer.
|
149 |
-
|
150 |
-
```bash
|
151 |
-
|
152 |
-
gcloud auth application-default login
|
153 |
-
gcloud config set project YOUR_PROJECT_ID
|
154 |
-
|
155 |
-
export GCLOUD_ACCESS_TOKEN=$(gcloud auth print-access-token)
|
156 |
-
export GCLOUD_PROJECT_ID=YOUR_PROJECT_ID
|
157 |
-
export GCLOUD_REGION=YOUR_REGION
|
158 |
-
export VERTEXAI_ENDPOINT_ID=YOUR_ENDPOINT_ID
|
159 |
-
|
160 |
-
python token_benchmark_ray.py \
|
161 |
-
--model "meta-llama/Llama-2-7b-chat-hf" \
|
162 |
-
--mean-input-tokens 550 \
|
163 |
-
--stddev-input-tokens 150 \
|
164 |
-
--mean-output-tokens 150 \
|
165 |
-
--stddev-output-tokens 10 \
|
166 |
-
--max-num-completed-requests 2 \
|
167 |
-
--timeout 600 \
|
168 |
-
--num-concurrent-requests 1 \
|
169 |
-
--results-dir "result_outputs" \
|
170 |
-
--llm-api "vertexai" \
|
171 |
-
--additional-sampling-params '{}'
|
172 |
-
|
173 |
-
```
|
174 |
-
|
175 |
-
### SageMaker
|
176 |
-
|
177 |
-
SageMaker doesn't return the total number of tokens that are generated by their endpoint, so tokens are counted using the LLama tokenizer.
|
178 |
-
|
179 |
-
```bash
|
180 |
-
|
181 |
-
export AWS_ACCESS_KEY_ID="YOUR_ACCESS_KEY_ID"
|
182 |
-
export AWS_SECRET_ACCESS_KEY="YOUR_SECRET_ACCESS_KEY"
|
183 |
-
export AWS_SESSION_TOKEN="YOUR_SESSION_TOKEN"
|
184 |
-
export AWS_REGION_NAME="YOUR_ENDPOINTS_REGION_NAME"
|
185 |
-
|
186 |
-
python llm_correctness.py \
|
187 |
-
--model "llama-2-7b" \
|
188 |
-
--llm-api "sagemaker" \
|
189 |
-
--max-num-completed-requests 2 \
|
190 |
-
--timeout 600 \
|
191 |
-
--num-concurrent-requests 1 \
|
192 |
-
--results-dir "result_outputs" \
|
193 |
-
|
194 |
-
```
|
195 |
-
|
196 |
-
see `python token_benchmark_ray.py --help` for more details on the arguments.
|
197 |
-
|
198 |
-
## Correctness Test
|
199 |
-
|
200 |
-
The correctness test spawns a number of concurrent requests to the LLM API with the following format:
|
201 |
-
|
202 |
-
```
|
203 |
-
Convert the following sequence of words into a number: {random_number_in_word_format}. Output just your final answer.
|
204 |
-
```
|
205 |
-
|
206 |
-
where random_number_in_word_format could be for example "one hundred and twenty three". The test then checks that the response contains that number in digit format which in this case would be 123.
|
207 |
-
|
208 |
-
The test does this for a number of randomly generated numbers and reports the number of responses that contain a mismatch.
|
209 |
-
|
210 |
-
To run the most basic correctness test you can run the llm_correctness.py script.
|
211 |
-
|
212 |
-
### OpenAI Compatible APIs
|
213 |
-
|
214 |
-
```bash
|
215 |
-
export OPENAI_API_KEY=secret_abcdefg
|
216 |
-
export OPENAI_API_BASE=https://console.endpoints.anyscale.com/m/v1
|
217 |
-
|
218 |
-
python llm_correctness.py \
|
219 |
-
--model "meta-llama/Llama-2-7b-chat-hf" \
|
220 |
-
--max-num-completed-requests 150 \
|
221 |
-
--timeout 600 \
|
222 |
-
--num-concurrent-requests 10 \
|
223 |
-
--results-dir "result_outputs"
|
224 |
-
```
|
225 |
-
|
226 |
-
### Anthropic
|
227 |
-
|
228 |
-
```bash
|
229 |
-
export ANTHROPIC_API_KEY=secret_abcdefg
|
230 |
-
|
231 |
-
python llm_correctness.py \
|
232 |
-
--model "claude-2" \
|
233 |
-
--llm-api "anthropic" \
|
234 |
-
--max-num-completed-requests 5 \
|
235 |
-
--timeout 600 \
|
236 |
-
--num-concurrent-requests 1 \
|
237 |
-
--results-dir "result_outputs"
|
238 |
-
```
|
239 |
-
|
240 |
-
### TogetherAI
|
241 |
-
|
242 |
-
```bash
|
243 |
-
export TOGETHERAI_API_KEY="YOUR_TOGETHER_KEY"
|
244 |
-
|
245 |
-
python llm_correctness.py \
|
246 |
-
--model "together_ai/togethercomputer/CodeLlama-7b-Instruct" \
|
247 |
-
--llm-api "litellm" \
|
248 |
-
--max-num-completed-requests 2 \
|
249 |
-
--timeout 600 \
|
250 |
-
--num-concurrent-requests 1 \
|
251 |
-
--results-dir "result_outputs" \
|
252 |
-
|
253 |
-
```
|
254 |
-
|
255 |
-
### Hugging Face
|
256 |
-
|
257 |
-
```bash
|
258 |
-
export HUGGINGFACE_API_KEY="YOUR_HUGGINGFACE_API_KEY"
|
259 |
-
export HUGGINGFACE_API_BASE="YOUR_HUGGINGFACE_API_ENDPOINT"
|
260 |
-
|
261 |
-
python llm_correctness.py \
|
262 |
-
--model "huggingface/meta-llama/Llama-2-7b-chat-hf" \
|
263 |
-
--llm-api "litellm" \
|
264 |
-
--max-num-completed-requests 2 \
|
265 |
-
--timeout 600 \
|
266 |
-
--num-concurrent-requests 1 \
|
267 |
-
--results-dir "result_outputs" \
|
268 |
-
|
269 |
-
```
|
270 |
-
|
271 |
-
### LiteLLM
|
272 |
-
|
273 |
-
LLMPerf can use LiteLLM to send prompts to LLM APIs. For the environment variables to set for the provider, and the arguments to set for model and additional-sampling-params,
|
274 |
-
|
275 |
-
see the [LiteLLM Provider Documentation](https://docs.litellm.ai/docs/providers).
|
276 |
-
|
277 |
-
```bash
|
278 |
-
python llm_correctness.py \
|
279 |
-
--model "meta-llama/Llama-2-7b-chat-hf" \
|
280 |
-
--llm-api "litellm" \
|
281 |
-
--max-num-completed-requests 2 \
|
282 |
-
--timeout 600 \
|
283 |
-
--num-concurrent-requests 1 \
|
284 |
-
--results-dir "result_outputs" \
|
285 |
-
|
286 |
-
```
|
287 |
-
|
288 |
-
see `python llm_correctness.py --help` for more details on the arguments.
|
289 |
-
|
290 |
-
|
291 |
-
### Vertex AI
|
292 |
-
|
293 |
-
Here, --model is used for logging, not for selecting the model. The model is specified in the Vertex AI Endpoint ID.
|
294 |
-
|
295 |
-
The GCLOUD_ACCESS_TOKEN needs to be somewhat regularly set, as the token generated by `gcloud auth print-access-token` expires after 15 minutes or so.
|
296 |
-
|
297 |
-
Vertex AI doesn't return the total number of tokens that are generated by their endpoint, so tokens are counted using the Llama tokenizer.
|
298 |
-
|
299 |
-
|
300 |
-
```bash
|
301 |
-
|
302 |
-
gcloud auth application-default login
|
303 |
-
gcloud config set project YOUR_PROJECT_ID
|
304 |
-
|
305 |
-
export GCLOUD_ACCESS_TOKEN=$(gcloud auth print-access-token)
|
306 |
-
export GCLOUD_PROJECT_ID=YOUR_PROJECT_ID
|
307 |
-
export GCLOUD_REGION=YOUR_REGION
|
308 |
-
export VERTEXAI_ENDPOINT_ID=YOUR_ENDPOINT_ID
|
309 |
-
|
310 |
-
python llm_correctness.py \
|
311 |
-
--model "meta-llama/Llama-2-7b-chat-hf" \
|
312 |
-
--llm-api "vertexai" \
|
313 |
-
--max-num-completed-requests 2 \
|
314 |
-
--timeout 600 \
|
315 |
-
--num-concurrent-requests 1 \
|
316 |
-
--results-dir "result_outputs" \
|
317 |
-
|
318 |
-
```
|
319 |
-
|
320 |
-
### SageMaker
|
321 |
-
|
322 |
-
SageMaker doesn't return the total number of tokens that are generated by their endpoint, so tokens are counted using the Llama tokenizer.
|
323 |
-
|
324 |
-
```bash
|
325 |
-
|
326 |
-
export AWS_ACCESS_KEY_ID="YOUR_ACCESS_KEY_ID"
|
327 |
-
export AWS_SECRET_ACCESS_KEY="YOUR_SECRET_ACCESS_KEY"
|
328 |
-
export AWS_SESSION_TOKEN="YOUR_SESSION_TOKEN"
|
329 |
-
export AWS_REGION_NAME="YOUR_ENDPOINTS_REGION_NAME"
|
330 |
-
|
331 |
-
python llm_correctness.py \
|
332 |
-
--model "llama-2-7b" \
|
333 |
-
--llm-api "sagemaker" \
|
334 |
-
--max-num-completed-requests 2 \
|
335 |
-
--timeout 600 \
|
336 |
-
--num-concurrent-requests 1 \
|
337 |
-
--results-dir "result_outputs" \
|
338 |
-
|
339 |
-
```
|
340 |
-
|
341 |
-
## Saving Results
|
342 |
-
|
343 |
-
The results of the load test and correctness test are saved in the results directory specified by the `--results-dir` argument. The results are saved in 2 files, one with the summary metrics of the test, and one with metrics from each individual request that is returned.
|
344 |
-
|
345 |
-
# Advanced Usage
|
346 |
-
|
347 |
-
The correctness tests were implemented with the following workflow in mind:
|
348 |
-
|
349 |
-
```python
|
350 |
-
import ray
|
351 |
-
from transformers import LlamaTokenizerFast
|
352 |
-
|
353 |
-
from llmperf.ray_clients.openai_chat_completions_client import (
|
354 |
-
OpenAIChatCompletionsClient,
|
355 |
-
)
|
356 |
-
from llmperf.models import RequestConfig
|
357 |
-
from llmperf.requests_launcher import RequestsLauncher
|
358 |
-
|
359 |
-
|
360 |
-
# Copying the environment variables and passing them to ray.init() is necessary
|
361 |
-
# For making any clients work.
|
362 |
-
ray.init(runtime_env={"env_vars": {"OPENAI_API_BASE" : "https://api.endpoints.anyscale.com/v1",
|
363 |
-
"OPENAI_API_KEY" : "YOUR_API_KEY"}})
|
364 |
-
|
365 |
-
base_prompt = "hello_world"
|
366 |
-
tokenizer = LlamaTokenizerFast.from_pretrained(
|
367 |
-
"hf-internal-testing/llama-tokenizer"
|
368 |
-
)
|
369 |
-
base_prompt_len = len(tokenizer.encode(base_prompt))
|
370 |
-
prompt = (base_prompt, base_prompt_len)
|
371 |
-
|
372 |
-
# Create a client for spawning requests
|
373 |
-
clients = [OpenAIChatCompletionsClient.remote()]
|
374 |
-
|
375 |
-
req_launcher = RequestsLauncher(clients)
|
376 |
-
|
377 |
-
req_config = RequestConfig(
|
378 |
-
model="meta-llama/Llama-2-7b-chat-hf",
|
379 |
-
prompt=prompt
|
380 |
-
)
|
381 |
-
|
382 |
-
req_launcher.launch_requests(req_config)
|
383 |
-
result = req_launcher.get_next_ready(block=True)
|
384 |
-
print(result)
|
385 |
-
|
386 |
-
```
|
387 |
-
|
388 |
-
# Implementing New LLM Clients
|
389 |
-
|
390 |
-
To implement a new LLM client, you need to implement the base class `llmperf.ray_llm_client.LLMClient` and decorate it as a ray actor.
|
391 |
-
|
392 |
-
```python
|
393 |
-
|
394 |
-
from llmperf.ray_llm_client import LLMClient
|
395 |
-
import ray
|
396 |
-
|
397 |
-
|
398 |
-
@ray.remote
|
399 |
-
class CustomLLMClient(LLMClient):
|
400 |
-
|
401 |
-
def llm_request(self, request_config: RequestConfig) -> Tuple[Metrics, str, RequestConfig]:
|
402 |
-
"""Make a single completion request to a LLM API
|
403 |
-
|
404 |
-
Returns:
|
405 |
-
Metrics about the performance characteristics of the request.
|
406 |
-
The text generated by the request to the LLM API.
|
407 |
-
The request_config used to make the request. This is mainly for logging purposes.
|
408 |
-
|
409 |
-
"""
|
410 |
-
...
|
411 |
-
|
412 |
-
```
|
413 |
-
|
414 |
-
# Legacy Codebase
|
415 |
-
The old LLMPerf code base can be found in the [llmperf-legacy](https://github.com/ray-project/llmval-legacy) repo.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
llmperf/analyze-token-benchmark-results.ipynb
DELETED
@@ -1,327 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"cells": [
|
3 |
-
{
|
4 |
-
"cell_type": "markdown",
|
5 |
-
"id": "56950450",
|
6 |
-
"metadata": {},
|
7 |
-
"source": [
|
8 |
-
"# Token Benchmark Example Analysis\n",
|
9 |
-
"The following is an example of the analysis that can be done on individual responses that are saved when running `token_benchmark_ray.py` with the flag `--results-dir` which enables the saving of all responses."
|
10 |
-
]
|
11 |
-
},
|
12 |
-
{
|
13 |
-
"cell_type": "code",
|
14 |
-
"execution_count": 1,
|
15 |
-
"id": "dacfe98a-e81b-4089-9506-97a652993b5b",
|
16 |
-
"metadata": {
|
17 |
-
"tags": []
|
18 |
-
},
|
19 |
-
"outputs": [],
|
20 |
-
"source": [
|
21 |
-
"import pandas as pd"
|
22 |
-
]
|
23 |
-
},
|
24 |
-
{
|
25 |
-
"cell_type": "code",
|
26 |
-
"execution_count": 6,
|
27 |
-
"id": "17f7abe9-ed9e-466c-b034-577489aaf98b",
|
28 |
-
"metadata": {
|
29 |
-
"tags": []
|
30 |
-
},
|
31 |
-
"outputs": [
|
32 |
-
{
|
33 |
-
"data": {
|
34 |
-
"text/html": [
|
35 |
-
"<div>\n",
|
36 |
-
"<style scoped>\n",
|
37 |
-
" .dataframe tbody tr th:only-of-type {\n",
|
38 |
-
" vertical-align: middle;\n",
|
39 |
-
" }\n",
|
40 |
-
"\n",
|
41 |
-
" .dataframe tbody tr th {\n",
|
42 |
-
" vertical-align: top;\n",
|
43 |
-
" }\n",
|
44 |
-
"\n",
|
45 |
-
" .dataframe thead th {\n",
|
46 |
-
" text-align: right;\n",
|
47 |
-
" }\n",
|
48 |
-
"</style>\n",
|
49 |
-
"<table border=\"1\" class=\"dataframe\">\n",
|
50 |
-
" <thead>\n",
|
51 |
-
" <tr style=\"text-align: right;\">\n",
|
52 |
-
" <th></th>\n",
|
53 |
-
" <th>error_code</th>\n",
|
54 |
-
" <th>error_msg</th>\n",
|
55 |
-
" <th>inter_token_latency_s</th>\n",
|
56 |
-
" <th>ttft_s</th>\n",
|
57 |
-
" <th>end_to_end_latency_s</th>\n",
|
58 |
-
" <th>request_output_throughput_token_per_s</th>\n",
|
59 |
-
" <th>number_total_tokens</th>\n",
|
60 |
-
" <th>number_output_tokens</th>\n",
|
61 |
-
" <th>number_input_tokens</th>\n",
|
62 |
-
" </tr>\n",
|
63 |
-
" </thead>\n",
|
64 |
-
" <tbody>\n",
|
65 |
-
" <tr>\n",
|
66 |
-
" <th>0</th>\n",
|
67 |
-
" <td>NaN</td>\n",
|
68 |
-
" <td></td>\n",
|
69 |
-
" <td>[0.5549881670012831, 0.0009654169989510001, 0....</td>\n",
|
70 |
-
" <td>0.554988</td>\n",
|
71 |
-
" <td>1.610734</td>\n",
|
72 |
-
" <td>44.079272</td>\n",
|
73 |
-
" <td>706</td>\n",
|
74 |
-
" <td>71</td>\n",
|
75 |
-
" <td>635</td>\n",
|
76 |
-
" </tr>\n",
|
77 |
-
" <tr>\n",
|
78 |
-
" <th>1</th>\n",
|
79 |
-
" <td>NaN</td>\n",
|
80 |
-
" <td></td>\n",
|
81 |
-
" <td>[0.6019128750049271, 0.007011749999946, 0.0144...</td>\n",
|
82 |
-
" <td>0.601913</td>\n",
|
83 |
-
" <td>1.725729</td>\n",
|
84 |
-
" <td>44.039357</td>\n",
|
85 |
-
" <td>730</td>\n",
|
86 |
-
" <td>76</td>\n",
|
87 |
-
" <td>654</td>\n",
|
88 |
-
" </tr>\n",
|
89 |
-
" </tbody>\n",
|
90 |
-
"</table>\n",
|
91 |
-
"</div>"
|
92 |
-
],
|
93 |
-
"text/plain": [
|
94 |
-
" error_code error_msg inter_token_latency_s \\\n",
|
95 |
-
"0 NaN [0.5549881670012831, 0.0009654169989510001, 0.... \n",
|
96 |
-
"1 NaN [0.6019128750049271, 0.007011749999946, 0.0144... \n",
|
97 |
-
"\n",
|
98 |
-
" ttft_s end_to_end_latency_s request_output_throughput_token_per_s \\\n",
|
99 |
-
"0 0.554988 1.610734 44.079272 \n",
|
100 |
-
"1 0.601913 1.725729 44.039357 \n",
|
101 |
-
"\n",
|
102 |
-
" number_total_tokens number_output_tokens number_input_tokens \n",
|
103 |
-
"0 706 71 635 \n",
|
104 |
-
"1 730 76 654 "
|
105 |
-
]
|
106 |
-
},
|
107 |
-
"execution_count": 6,
|
108 |
-
"metadata": {},
|
109 |
-
"output_type": "execute_result"
|
110 |
-
}
|
111 |
-
],
|
112 |
-
"source": [
|
113 |
-
"# path to the individual responses json file\n",
|
114 |
-
"df = pd.read_json('/home/ray/default/llmperf/result_outputs/550_150_individual_responses.json')\n"
|
115 |
-
]
|
116 |
-
},
|
117 |
-
{
|
118 |
-
"cell_type": "code",
|
119 |
-
"execution_count": 12,
|
120 |
-
"id": "565a59e4",
|
121 |
-
"metadata": {},
|
122 |
-
"outputs": [],
|
123 |
-
"source": [
|
124 |
-
"valid_df = df[(df[\"error_code\"] != \"\")]"
|
125 |
-
]
|
126 |
-
},
|
127 |
-
{
|
128 |
-
"cell_type": "code",
|
129 |
-
"execution_count": 13,
|
130 |
-
"id": "102894bc",
|
131 |
-
"metadata": {},
|
132 |
-
"outputs": [
|
133 |
-
{
|
134 |
-
"data": {
|
135 |
-
"text/html": [
|
136 |
-
"<div>\n",
|
137 |
-
"<style scoped>\n",
|
138 |
-
" .dataframe tbody tr th:only-of-type {\n",
|
139 |
-
" vertical-align: middle;\n",
|
140 |
-
" }\n",
|
141 |
-
"\n",
|
142 |
-
" .dataframe tbody tr th {\n",
|
143 |
-
" vertical-align: top;\n",
|
144 |
-
" }\n",
|
145 |
-
"\n",
|
146 |
-
" .dataframe thead th {\n",
|
147 |
-
" text-align: right;\n",
|
148 |
-
" }\n",
|
149 |
-
"</style>\n",
|
150 |
-
"<table border=\"1\" class=\"dataframe\">\n",
|
151 |
-
" <thead>\n",
|
152 |
-
" <tr style=\"text-align: right;\">\n",
|
153 |
-
" <th></th>\n",
|
154 |
-
" <th>error_code</th>\n",
|
155 |
-
" <th>error_msg</th>\n",
|
156 |
-
" <th>inter_token_latency_s</th>\n",
|
157 |
-
" <th>ttft_s</th>\n",
|
158 |
-
" <th>end_to_end_latency_s</th>\n",
|
159 |
-
" <th>request_output_throughput_token_per_s</th>\n",
|
160 |
-
" <th>number_total_tokens</th>\n",
|
161 |
-
" <th>number_output_tokens</th>\n",
|
162 |
-
" <th>number_input_tokens</th>\n",
|
163 |
-
" </tr>\n",
|
164 |
-
" </thead>\n",
|
165 |
-
" <tbody>\n",
|
166 |
-
" <tr>\n",
|
167 |
-
" <th>0</th>\n",
|
168 |
-
" <td>NaN</td>\n",
|
169 |
-
" <td></td>\n",
|
170 |
-
" <td>[0.5549881670012831, 0.0009654169989510001, 0....</td>\n",
|
171 |
-
" <td>0.554988</td>\n",
|
172 |
-
" <td>1.610734</td>\n",
|
173 |
-
" <td>44.079272</td>\n",
|
174 |
-
" <td>706</td>\n",
|
175 |
-
" <td>71</td>\n",
|
176 |
-
" <td>635</td>\n",
|
177 |
-
" </tr>\n",
|
178 |
-
" <tr>\n",
|
179 |
-
" <th>1</th>\n",
|
180 |
-
" <td>NaN</td>\n",
|
181 |
-
" <td></td>\n",
|
182 |
-
" <td>[0.6019128750049271, 0.007011749999946, 0.0144...</td>\n",
|
183 |
-
" <td>0.601913</td>\n",
|
184 |
-
" <td>1.725729</td>\n",
|
185 |
-
" <td>44.039357</td>\n",
|
186 |
-
" <td>730</td>\n",
|
187 |
-
" <td>76</td>\n",
|
188 |
-
" <td>654</td>\n",
|
189 |
-
" </tr>\n",
|
190 |
-
" </tbody>\n",
|
191 |
-
"</table>\n",
|
192 |
-
"</div>"
|
193 |
-
],
|
194 |
-
"text/plain": [
|
195 |
-
" error_code error_msg inter_token_latency_s \\\n",
|
196 |
-
"0 NaN [0.5549881670012831, 0.0009654169989510001, 0.... \n",
|
197 |
-
"1 NaN [0.6019128750049271, 0.007011749999946, 0.0144... \n",
|
198 |
-
"\n",
|
199 |
-
" ttft_s end_to_end_latency_s request_output_throughput_token_per_s \\\n",
|
200 |
-
"0 0.554988 1.610734 44.079272 \n",
|
201 |
-
"1 0.601913 1.725729 44.039357 \n",
|
202 |
-
"\n",
|
203 |
-
" number_total_tokens number_output_tokens number_input_tokens \n",
|
204 |
-
"0 706 71 635 \n",
|
205 |
-
"1 730 76 654 "
|
206 |
-
]
|
207 |
-
},
|
208 |
-
"execution_count": 13,
|
209 |
-
"metadata": {},
|
210 |
-
"output_type": "execute_result"
|
211 |
-
}
|
212 |
-
],
|
213 |
-
"source": [
|
214 |
-
"valid_df"
|
215 |
-
]
|
216 |
-
},
|
217 |
-
{
|
218 |
-
"cell_type": "code",
|
219 |
-
"execution_count": 14,
|
220 |
-
"id": "c7519fc9",
|
221 |
-
"metadata": {},
|
222 |
-
"outputs": [
|
223 |
-
{
|
224 |
-
"name": "stdout",
|
225 |
-
"output_type": "stream",
|
226 |
-
"text": [
|
227 |
-
"Mean number of input tokens: 644.5. Mean number of output tokens: 73.5\n"
|
228 |
-
]
|
229 |
-
},
|
230 |
-
{
|
231 |
-
"data": {
|
232 |
-
"text/plain": [
|
233 |
-
"<Axes: title={'center': 'Number of Input Tokens vs. TTFT'}, xlabel='number_input_tokens', ylabel='ttft_s'>"
|
234 |
-
]
|
235 |
-
},
|
236 |
-
"execution_count": 14,
|
237 |
-
"metadata": {},
|
238 |
-
"output_type": "execute_result"
|
239 |
-
},
|
240 |
-
{
|
241 |
-
"data": {
|
242 |
-
"image/png": "",
|
243 |
-
"text/plain": [
|
244 |
-
"<Figure size 640x480 with 1 Axes>"
|
245 |
-
]
|
246 |
-
},
|
247 |
-
"metadata": {},
|
248 |
-
"output_type": "display_data"
|
249 |
-
}
|
250 |
-
],
|
251 |
-
"source": [
|
252 |
-
"final_df = pd.DataFrame()\n",
|
253 |
-
"final_df[\"number_input_tokens\"] = valid_df[\"number_input_tokens\"]\n",
|
254 |
-
"final_df[\"number_output_tokens\"] = valid_df[\"number_output_tokens\"]\n",
|
255 |
-
"final_df[\"ttft_s\"] = valid_df[\"ttft_s\"]\n",
|
256 |
-
"final_df[\"end_to_end_latency_s\"] = valid_df[\"end_to_end_latency_s\"]\n",
|
257 |
-
"final_df[\"generation_throughput\"] = valid_df[\"request_output_throughput_token_per_s\"]\n",
|
258 |
-
"\n",
|
259 |
-
"mean_tokens_in = final_df[\"number_input_tokens\"].mean()\n",
|
260 |
-
"mean_tokens_out = valid_df[\"number_output_tokens\"].mean()\n",
|
261 |
-
"print(f\"Mean number of input tokens: {mean_tokens_in}. Mean number of output tokens: {mean_tokens_out}\")\n",
|
262 |
-
"final_df.plot.scatter(x=\"number_input_tokens\", y=\"ttft_s\", title=\"Number of Input Tokens vs. TTFT\")"
|
263 |
-
]
|
264 |
-
},
|
265 |
-
{
|
266 |
-
"cell_type": "code",
|
267 |
-
"execution_count": 15,
|
268 |
-
"id": "a14de79c",
|
269 |
-
"metadata": {},
|
270 |
-
"outputs": [
|
271 |
-
{
|
272 |
-
"data": {
|
273 |
-
"text/plain": [
|
274 |
-
"<Axes: title={'center': 'Token Latencies'}, ylabel='Frequency'>"
|
275 |
-
]
|
276 |
-
},
|
277 |
-
"execution_count": 15,
|
278 |
-
"metadata": {},
|
279 |
-
"output_type": "execute_result"
|
280 |
-
},
|
281 |
-
{
|
282 |
-
"data": {
|
283 |
-
"image/png": "",
|
284 |
-
"text/plain": [
|
285 |
-
"<Figure size 640x480 with 1 Axes>"
|
286 |
-
]
|
287 |
-
},
|
288 |
-
"metadata": {},
|
289 |
-
"output_type": "display_data"
|
290 |
-
}
|
291 |
-
],
|
292 |
-
"source": [
|
293 |
-
"all_token_latencies = valid_df['end_to_end_latency_s'].apply(pd.Series).stack()\n",
|
294 |
-
"all_token_latencies = all_token_latencies.reset_index(drop=True)\n",
|
295 |
-
"all_token_latencies.plot.hist(title=\"Token Latencies\")\n"
|
296 |
-
]
|
297 |
-
},
|
298 |
-
{
|
299 |
-
"cell_type": "code",
|
300 |
-
"execution_count": null,
|
301 |
-
"metadata": {},
|
302 |
-
"outputs": [],
|
303 |
-
"source": []
|
304 |
-
}
|
305 |
-
],
|
306 |
-
"metadata": {
|
307 |
-
"kernelspec": {
|
308 |
-
"display_name": "Python 3 (ipykernel)",
|
309 |
-
"language": "python",
|
310 |
-
"name": "python3"
|
311 |
-
},
|
312 |
-
"language_info": {
|
313 |
-
"codemirror_mode": {
|
314 |
-
"name": "ipython",
|
315 |
-
"version": 3
|
316 |
-
},
|
317 |
-
"file_extension": ".py",
|
318 |
-
"mimetype": "text/x-python",
|
319 |
-
"name": "python",
|
320 |
-
"nbconvert_exporter": "python",
|
321 |
-
"pygments_lexer": "ipython3",
|
322 |
-
"version": "3.10.13"
|
323 |
-
}
|
324 |
-
},
|
325 |
-
"nbformat": 4,
|
326 |
-
"nbformat_minor": 5
|
327 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
llmperf/llm_correctness.py
DELETED
@@ -1,309 +0,0 @@
|
|
1 |
-
import argparse
|
2 |
-
import json
|
3 |
-
import os
|
4 |
-
from pathlib import Path
|
5 |
-
import random
|
6 |
-
import re
|
7 |
-
import time
|
8 |
-
from typing import Any, Dict, List, Optional, Tuple
|
9 |
-
|
10 |
-
import num2words
|
11 |
-
import ray
|
12 |
-
from tqdm import tqdm
|
13 |
-
|
14 |
-
from llmperf import common_metrics
|
15 |
-
from llmperf.common import SUPPORTED_APIS, construct_clients
|
16 |
-
from llmperf.models import RequestConfig
|
17 |
-
from llmperf.requests_launcher import RequestsLauncher
|
18 |
-
from llmperf.utils import (
|
19 |
-
LLMPerfResults,
|
20 |
-
)
|
21 |
-
|
22 |
-
MAX_RANDOM_NUMBER = 10000
|
23 |
-
|
24 |
-
|
25 |
-
def llm_correctness(
    model: str,
    additional_sampling_params: Optional[Dict[str, Any]] = None,
    num_concurrent_requests: int = 1,
    max_num_completed_requests: int = 500,
    test_timeout_s=90,
    llm_api="chat",
) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
    """Run the number-conversion correctness test against an LLM API.

    Each request asks the model to convert a random number, spelled out in
    words, back into digits. A response counts as a mismatch when the expected
    number does not appear among the integers found in the generated text
    (thousands separators such as "1,234" are removed before matching).

    Args:
        model: The name of the model to query.
        additional_sampling_params: Extra sampling parameters merged on top of
            the default ``{"temperature": 0.0}`` and sent with every request.
        num_concurrent_requests: The number of clients to construct. Finished
            requests are collected once every ``num_concurrent_requests``
            launches.
        max_num_completed_requests: Stop launching once this many requests
            have completed.
        test_timeout_s: Stop launching once this many seconds have elapsed.
        llm_api: The name of the LLM API, forwarded to ``construct_clients``.
            NOTE(review): the default "chat" is not listed in
            ``SUPPORTED_APIS`` -- confirm ``construct_clients`` accepts it.

    Returns:
        A tuple ``(summary_metrics, raw_results)``. ``summary_metrics`` holds
        the error/mismatch counts and rates plus the test metadata;
        ``raw_results`` holds one dict per completed request with its metrics,
        generated text and request config. Both rates are ``0.0`` when no
        request completed.

    """
    if not additional_sampling_params:
        additional_sampling_params = {}

    clients = construct_clients(llm_api=llm_api, num_clients=num_concurrent_requests)
    req_launcher = RequestsLauncher(clients)
    start_time = time.monotonic()

    num_errored_requests = 0
    num_mismatched_requests = 0
    num_completed_requests = 0

    sampling_params = {"temperature": 0.0}
    sampling_params.update(additional_sampling_params)
    completed_requests = []
    # Named so it does not shadow the ``iter`` builtin.
    num_launched = 0
    pbar = tqdm(total=max_num_completed_requests)
    while (
        time.monotonic() - start_time < test_timeout_s
        and num_completed_requests < max_num_completed_requests
    ):
        num_launched += 1
        rnd_number = random.randint(0, MAX_RANDOM_NUMBER)
        rnd_num_words = num2words.num2words(rnd_number)

        prompt = f"Convert the following sequence of words into a number: {rnd_num_words}.\nPrint the number first."

        request_config = RequestConfig(
            model=model,
            prompt=(prompt, 0),
            sampling_params=sampling_params,
            metadata={"rnd_number": rnd_number},
            llm_api=llm_api,
        )
        req_launcher.launch_requests(request_config)

        # Only poll for finished requests after a full batch has been launched.
        if not (num_launched % num_concurrent_requests):
            completed_requests.extend(req_launcher.get_next_ready())
            pbar.update(len(completed_requests) - num_completed_requests)
            num_completed_requests = len(completed_requests)

    pbar.close()
    end_time = time.monotonic()
    if end_time - start_time >= test_timeout_s:
        print("Test timed out before all requests could be completed.")

    raw_results = []

    # Matches a comma that sits between digits, e.g. the one in "1,234".
    commas_between_numbers_re = r"(\d+),(?=\d)"

    print("Mismatched and errored requests.")
    for out in completed_requests:
        metrics, generated_text, completed_request_config = out

        raw_results.append(
            {
                "metrics": metrics,
                "generated_text": generated_text,
                "request_config": dict(completed_request_config),
            }
        )

        # A truthy error code means the request itself failed.
        if metrics[common_metrics.ERROR_CODE]:
            num_errored_requests += 1
            print(
                f" The request errored: {metrics[common_metrics.ERROR_CODE]}, "
                f"{metrics[common_metrics.ERROR_MSG]} "
            )
            continue

        expected = str(completed_request_config.metadata["rnd_number"])
        try:
            gen_text_commas_removed = re.sub(
                commas_between_numbers_re, r"\1", generated_text
            )
            nums = re.findall(r"\d+", gen_text_commas_removed)
            generated_text = gen_text_commas_removed.replace("\n", " ")
            # Explicit membership test instead of ``assert``, which is
            # stripped under ``python -O`` and would hide every mismatch.
            matched = expected in nums
        except TypeError:
            # Non-string output (e.g. None) cannot contain the number.
            matched = False

        if not matched:
            num_mismatched_requests += 1
            print(
                f" mismatched request: {generated_text}, expected: {completed_request_config.metadata['rnd_number']}"
            )
    print()

    # Guard against a timeout that fired before any request completed.
    if num_completed_requests:
        error_rate = num_errored_requests / num_completed_requests
        mismatch_rate = num_mismatched_requests / num_completed_requests
    else:
        error_rate = 0.0
        mismatch_rate = 0.0
    num_non_errored_requests = num_completed_requests - num_errored_requests
    summary_metrics = {}
    summary_metrics[common_metrics.NUM_ERRORS] = num_errored_requests
    summary_metrics["num_mismatched_requests"] = num_mismatched_requests
    summary_metrics["error_rate"] = error_rate
    summary_metrics["mismatch_rate"] = mismatch_rate
    summary_metrics[common_metrics.NUM_COMPLETED_REQUESTS] = num_completed_requests
    summary_metrics["num_non_errored_requests"] = num_non_errored_requests

    # Metadata
    summary_metrics["model"] = model
    summary_metrics["num_concurrent_requests"] = num_concurrent_requests
    summary_metrics["additional_sampling_params"] = additional_sampling_params
    summary_metrics["llm_api"] = llm_api

    return summary_metrics, raw_results
|
150 |
-
|
151 |
-
|
152 |
-
def run(
    llm_api: str,
    model: str,
    test_timeout_s: int,
    max_num_completed_requests: int,
    num_concurrent_requests: int,
    additional_sampling_params: str,
    results_dir: str,
    user_metadata: Dict[str, str],
):
    """Run the correctness test, print a summary and optionally save results.

    Args:
        llm_api: The name of the LLM API to query.
        model: The name of the model to query.
        test_timeout_s: The amount of time to run the test for before reporting results.
        max_num_completed_requests: The number of requests to complete before finishing the test.
        num_concurrent_requests: The number of concurrent requests to make.
        additional_sampling_params: A JSON string of additional sampling
            parameters to send with each request.
        results_dir: The directory to save the results to. Nothing is saved
            when this is empty.
        user_metadata: Extra key/value pairs merged into the summary metrics
            before they are saved.

    Raises:
        ValueError: If ``results_dir`` exists but is not a directory.

    """
    summary_metrics, raw_results = llm_correctness(
        model=model,
        llm_api=llm_api,
        test_timeout_s=test_timeout_s,
        max_num_completed_requests=max_num_completed_requests,
        num_concurrent_requests=num_concurrent_requests,
        additional_sampling_params=json.loads(additional_sampling_params),
    )

    time.sleep(2)

    report_lines = [
        f"Results for llm correctness test for {model} queried with the {llm_api} api.",
        (
            f"Errors: {summary_metrics[common_metrics.NUM_ERRORS]}, "
            f"Error rate: {summary_metrics['error_rate']}"
        ),
        (
            f"Mismatched: {summary_metrics['num_mismatched_requests']}, "
            f"Mismatch rate: {summary_metrics['mismatch_rate']}"
        ),
        f"Completed: {summary_metrics[common_metrics.NUM_COMPLETED_REQUESTS]}",
        f"Completed without errors: {summary_metrics['num_non_errored_requests']}",
    ]
    for line in report_lines:
        print(line)

    # Nothing to persist unless a results directory was requested.
    if not results_dir:
        return

    # Build a filesystem-friendly base name from the model name.
    base_name = re.sub(r"[^\w\d-]+", "-", f"{model}_correctness")
    base_name = re.sub(r"-{2,}", "-", base_name)
    summary_file_name = f"{base_name}_summary"
    individual_responses_filename = f"{base_name}_individual_responses"

    summary_metrics.update(user_metadata)
    results = LLMPerfResults(name=summary_file_name, metadata=summary_metrics)

    output_dir = Path(results_dir)
    if output_dir.exists():
        if not output_dir.is_dir():
            raise ValueError(f"{output_dir} is not a directory")
    else:
        output_dir.mkdir(parents=True)

    summary_path = output_dir / f"{summary_file_name}.json"
    with open(summary_path, "w") as summary_file:
        json.dump(results.to_dict(), summary_file, indent=4)

    responses_path = output_dir / f"{individual_responses_filename}.json"
    with open(responses_path, "w") as responses_file:
        json.dump(raw_results, responses_file, indent=4)
|
223 |
-
|
224 |
-
|
225 |
-
# Command-line interface for the correctness test. The parser is built at
# import time; arguments are only parsed when the file is run as a script.
args = argparse.ArgumentParser(description="Run a correctness test for a given model.")

args.add_argument(
    "--model", type=str, required=True, help="The model to use for this load test."
)
args.add_argument(
    "--num-concurrent-requests",
    type=int,
    default=10,
    help=("The number of concurrent requests to send. (default: %(default)s)"),
)
args.add_argument(
    "--timeout",
    type=int,
    default=90,
    help="The amount of time to run the load test for. (default: %(default)s)",
)
args.add_argument(
    "--max-num-completed-requests",
    type=int,
    default=50,
    help=(
        "The number of requests to complete before finishing the test. Note "
        "that its possible for the test to timeout first. (default: %(default)s)"
    ),
)
args.add_argument(
    "--additional-sampling-params",
    type=str,
    default="{}",
    help=(
        "Additional sampling params to send with the each request to the LLM API. "
        "(default: %(default)s) No additional sampling params are sent."
    ),
)
args.add_argument(
    "--results-dir",
    type=str,
    default="",
    help=(
        "The directory to save the results to. "
        "(`default: %(default)s`) No results are saved)"
    ),
)
args.add_argument(
    "--llm-api",
    type=str,
    default="openai",
    help=(
        f"The type of request to make. The supported llm apis are {SUPPORTED_APIS} "
        " (`default: %(default)s`)"
    ),
)
args.add_argument(
    "--metadata",
    type=str,
    default="",
    help=(
        "A comma separated list of metadata to include in the results, e.g. "
        "name=foo,bar=1. These will be added to the metadata field of the results. "
    ),
)

if __name__ == "__main__":
    args = args.parse_args()

    # Pass the current environment to ray.init() so the clients it starts can
    # read the API keys and endpoints configured in this shell.
    env_vars = dict(os.environ)
    ray.init(runtime_env={"env_vars": env_vars})
    # Parse user metadata.
    user_metadata = {}
    if args.metadata:
        for item in args.metadata.split(","):
            # Split on the first "=" only so values may themselves contain
            # "=" (e.g. "url=http://host/?a=b").
            key, separator, value = item.partition("=")
            if not separator:
                raise ValueError(
                    f"Invalid --metadata item {item!r}: expected key=value"
                )
            user_metadata[key] = value

    run(
        llm_api=args.llm_api,
        model=args.model,
        test_timeout_s=args.timeout,
        max_num_completed_requests=args.max_num_completed_requests,
        num_concurrent_requests=args.num_concurrent_requests,
        additional_sampling_params=args.additional_sampling_params,
        results_dir=args.results_dir,
        user_metadata=user_metadata,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
llmperf/pre-commit.sh
DELETED
@@ -1,5 +0,0 @@
|
|
1 |
-
#!/bin/bash
|
2 |
-
echo "Running pre-hooks before committing..."
|
3 |
-
|
4 |
-
echo "======FORMAT====="
|
5 |
-
black . -q
|
|
|
|
|
|
|
|
|
|
|
|
llmperf/pyproject.toml
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
[build-system]
|
2 |
-
requires = ["setuptools>=43.0.0", "wheel"]
|
3 |
-
build-backend = "setuptools.build_meta"
|
4 |
-
|
5 |
-
[project]
|
6 |
-
name = "LLMPerf"
|
7 |
-
version = "0.1.0"
|
8 |
-
description = "A framework for load testing LLM APIs"
|
9 |
-
authors = [{name="Avnish Narayan", email="[email protected]"}]
|
10 |
-
license = {text= "Apache-2.0"}
|
11 |
-
requires-python = ">=3.8, <3.11"
|
12 |
-
dependencies = ["pydantic<2.5",
|
13 |
-
"ray",
|
14 |
-
"pytest>=6.0",
|
15 |
-
"seaborn>=0.11",
|
16 |
-
"awscli>=1.22",
|
17 |
-
"typer>=0.4",
|
18 |
-
"litellm>=0.1.738",
|
19 |
-
"num2words",
|
20 |
-
"transformers",
|
21 |
-
"tqdm",
|
22 |
-
"boto3",
|
23 |
-
"google-cloud-aiplatform"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
llmperf/requirements-dev.txt
DELETED
@@ -1,2 +0,0 @@
|
|
1 |
-
# For lints
|
2 |
-
black
|
|
|
|
|
|
llmperf/src/llmperf/__init__.py
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
|
|
|
|
llmperf/src/llmperf/common.py
DELETED
@@ -1,38 +0,0 @@
|
|
1 |
-
from typing import List
|
2 |
-
from llmperf.ray_clients.litellm_client import LiteLLMClient
|
3 |
-
from llmperf.ray_clients.openai_chat_completions_client import (
|
4 |
-
OpenAIChatCompletionsClient,
|
5 |
-
)
|
6 |
-
from llmperf.ray_clients.sagemaker_client import SageMakerClient
|
7 |
-
from llmperf.ray_clients.vertexai_client import VertexAIClient
|
8 |
-
from llmperf.ray_llm_client import LLMClient
|
9 |
-
|
10 |
-
|
11 |
-
SUPPORTED_APIS = ["openai", "anthropic", "litellm"]
|
12 |
-
|
13 |
-
|
14 |
-
def construct_clients(llm_api: str, num_clients: int) -> List[LLMClient]:
    """Construct LLMClients that will be used to make requests to the LLM API.

    "openai", "sagemaker" and "vertexai" each map to a dedicated client
    class; any other name listed in ``SUPPORTED_APIS`` is served by
    ``LiteLLMClient``.

    Args:
        llm_api: The name of the LLM API to use.
        num_clients: The number of clients to create (one per concurrent
            request).

    Returns:
        A list of ``num_clients`` handles, each created with ``.remote()``.

    Raises:
        ValueError: If ``llm_api`` is not one of the accepted names.

    """
    # APIs that have their own client implementation.
    dedicated_clients = {
        "openai": OpenAIChatCompletionsClient,
        "sagemaker": SageMakerClient,
        "vertexai": VertexAIClient,
    }

    if llm_api in dedicated_clients:
        client_cls = dedicated_clients[llm_api]
    elif llm_api in SUPPORTED_APIS:
        client_cls = LiteLLMClient
    else:
        # Report every accepted value, not only SUPPORTED_APIS: the dedicated
        # clients above are valid choices too.
        accepted = sorted(set(SUPPORTED_APIS) | set(dedicated_clients))
        raise ValueError(
            f"llm_api must be one of the supported LLM APIs: {accepted}"
        )

    return [client_cls.remote() for _ in range(num_clients)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
llmperf/src/llmperf/common_metrics.py
DELETED
@@ -1,17 +0,0 @@
|
|
1 |
-
# TODO (Avnishn): compute metrics in class
|
2 |
-
INTER_TOKEN_LAT = "inter_token_latency_s"
|
3 |
-
TTFT = "ttft_s"
|
4 |
-
E2E_LAT = "end_to_end_latency_s"
|
5 |
-
NUM_INPUT_TOKENS = "number_input_tokens"
|
6 |
-
NUM_OUTPUT_TOKENS = "number_output_tokens"
|
7 |
-
NUM_TOTAL_TOKENS = "number_total_tokens"
|
8 |
-
REQ_OUTPUT_THROUGHPUT = "request_output_throughput_token_per_s"
|
9 |
-
ERROR_MSG = "error_msg"
|
10 |
-
ERROR_CODE = "error_code"
|
11 |
-
ERROR_CODE_FREQ = "error_code_frequency"
|
12 |
-
NUM_ERRORS = "number_errors"
|
13 |
-
OUTPUT_THROUGHPUT = "mean_output_throughput_token_per_s"
|
14 |
-
NUM_COMPLETED_REQUESTS = "num_completed_requests"
|
15 |
-
COMPLETED_REQUESTS_PER_MIN = "num_completed_requests_per_min"
|
16 |
-
ERROR_RATE = "error_rate"
|
17 |
-
NUM_REQ_STARTED = "num_requests_started"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
llmperf/src/llmperf/models.py
DELETED
@@ -1,21 +0,0 @@
|
|
1 |
-
from typing import Any, Dict, List, Optional, Tuple
|
2 |
-
from pydantic import BaseModel
|
3 |
-
|
4 |
-
|
5 |
-
class RequestConfig(BaseModel):
    """Everything a client needs to issue one request to an LLM API.

    Attributes:
        model: The model to use.
        prompt: A ``(text, length)`` pair. The clients in this package send
            the text as the user message and record the integer as the
            number of input tokens.
        sampling_params: Additional sampling parameters to send with the
            request. The clients forward these keys to the provider, so the
            valid names are whatever the target completions API documents.
        llm_api: The name of the LLM API to send the request to.
        metadata: Additional metadata to attach to the request for logging
            or validation purposes.
    """

    # Required fields.
    model: str
    prompt: Tuple[str, int]

    # Optional fields; all default to None.
    sampling_params: Optional[Dict[str, Any]] = None
    llm_api: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
llmperf/src/llmperf/ray_clients/__init__.py
DELETED
File without changes
|
llmperf/src/llmperf/ray_clients/litellm_client.py
DELETED
@@ -1,100 +0,0 @@
|
|
1 |
-
import time
|
2 |
-
from typing import Any, Dict
|
3 |
-
import ray
|
4 |
-
|
5 |
-
from llmperf.ray_llm_client import LLMClient
|
6 |
-
from llmperf.models import RequestConfig
|
7 |
-
from llmperf import common_metrics
|
8 |
-
|
9 |
-
|
10 |
-
@ray.remote
class LiteLLMClient(LLMClient):
    """Client for LiteLLM Completions API."""

    def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]:
        """Stream one chat completion through LiteLLM and time it.

        Args:
            request_config: The request to send. ``prompt`` is unpacked as a
                ``(text, length)`` pair and ``llm_api`` must be set. Unless
                ``llm_api`` is ``"litellm"`` the model name is prefixed with
                ``"<llm_api>/"`` before being handed to LiteLLM.

        Returns:
            A ``(metrics, generated_text, request_config)`` tuple (the
            signature's ``Dict`` annotation is kept unchanged for
            compatibility). ``metrics`` is keyed by the ``common_metrics``
            constants; on failure it carries the exception text and the
            error code ``-1``.

        Raises:
            AssertionError: If ``request_config.llm_api`` is ``None``.
            ValueError: If LiteLLM reports missing environment variables for
                the model.
        """
        # litellm package isn't serializable, so we import it within the function
        # to maintain compatibility with ray.
        from litellm import completion, validate_environment

        prompt, prompt_len = request_config.prompt

        message = [
            {"role": "system", "content": ""},
            {"role": "user", "content": prompt},
        ]
        assert (
            request_config.llm_api is not None
        ), "the request config's llm_api must be set."
        if request_config.llm_api == "litellm":
            model = request_config.model
        else:
            model = request_config.llm_api + "/" + request_config.model
        validation_result = validate_environment(model)
        if validation_result["missing_keys"]:
            raise ValueError(
                f"The following environment vars weren't found but were necessary for "
                f"the model {request_config.model}: {validation_result['missing_keys']}"
            )
        body = {
            "model": model,
            "messages": message,
            "stream": True,
        }
        body.update(request_config.sampling_params or {})

        time_to_next_token = []
        tokens_received = 0
        ttft = 0
        error_response_code = -1
        generated_text = ""
        output_throughput = 0
        total_request_time = 0

        metrics = {}
        metrics[common_metrics.ERROR_CODE] = None
        metrics[common_metrics.ERROR_MSG] = ""

        try:
            start_time = time.monotonic()
            most_recent_received_token_time = time.monotonic()

            response = completion(**body)
            for tok in response:
                if tok.choices[0].delta:
                    delta = tok.choices[0].delta
                    if delta.get("content", None):
                        if ttft == 0:
                            # First chunk with content: time to first token.
                            ttft = time.monotonic() - start_time
                            time_to_next_token.append(ttft)
                        else:
                            time_to_next_token.append(
                                time.monotonic() - most_recent_received_token_time
                            )
                        generated_text += delta["content"]
                        most_recent_received_token_time = time.monotonic()
                        # Each content-bearing chunk is counted as one token.
                        tokens_received += 1

            total_request_time = time.monotonic() - start_time

            output_throughput = tokens_received / total_request_time

        except Exception as e:
            # Record the exception text; previously the message stored here
            # was always the empty string, which made failures undiagnosable.
            metrics[common_metrics.ERROR_MSG] = str(e)
            metrics[common_metrics.ERROR_CODE] = error_response_code

            print(f"Warning Or Error: {e}")
            print(error_response_code)

        metrics[common_metrics.INTER_TOKEN_LAT] = sum(time_to_next_token)
        metrics[common_metrics.TTFT] = ttft
        metrics[common_metrics.E2E_LAT] = total_request_time
        metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = output_throughput
        metrics[common_metrics.NUM_TOTAL_TOKENS] = tokens_received + prompt_len
        metrics[common_metrics.NUM_OUTPUT_TOKENS] = tokens_received
        metrics[common_metrics.NUM_INPUT_TOKENS] = prompt_len
        return metrics, generated_text, request_config
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
llmperf/src/llmperf/ray_clients/openai_chat_completions_client.py
DELETED
@@ -1,120 +0,0 @@
|
|
1 |
-
import json
|
2 |
-
import os
|
3 |
-
import time
|
4 |
-
from typing import Any, Dict
|
5 |
-
|
6 |
-
import ray
|
7 |
-
import requests
|
8 |
-
|
9 |
-
from llmperf.ray_llm_client import LLMClient
|
10 |
-
from llmperf.models import RequestConfig
|
11 |
-
from llmperf import common_metrics
|
12 |
-
|
13 |
-
|
14 |
-
@ray.remote
class OpenAIChatCompletionsClient(LLMClient):
    """Client for OpenAI Chat Completions API."""

    def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]:
        """Stream one chat completion from ``$OPENAI_API_BASE`` and time it.

        Args:
            request_config: The request to send. ``prompt`` is unpacked as a
                ``(text, length)`` pair; ``sampling_params`` are merged into
                the JSON body.

        Returns:
            A ``(metrics, generated_text, request_config)`` tuple (the
            signature's ``Dict`` annotation is kept unchanged for
            compatibility). ``metrics`` is keyed by the ``common_metrics``
            constants. Failures while talking to the endpoint are caught and
            reported through the error fields of ``metrics``.

        Raises:
            ValueError: If ``OPENAI_API_BASE`` or ``OPENAI_API_KEY`` is unset.
        """
        prompt, prompt_len = request_config.prompt

        message = [
            {"role": "system", "content": ""},
            {"role": "user", "content": prompt},
        ]
        body = {
            "model": request_config.model,
            "messages": message,
            "stream": True,
        }
        body.update(request_config.sampling_params or {})

        time_to_next_token = []
        tokens_received = 0
        ttft = 0
        error_response_code = -1
        generated_text = ""
        error_msg = ""
        output_throughput = 0
        total_request_time = 0

        metrics = {}
        metrics[common_metrics.ERROR_CODE] = None
        metrics[common_metrics.ERROR_MSG] = ""

        start_time = time.monotonic()
        most_recent_received_token_time = time.monotonic()
        address = os.environ.get("OPENAI_API_BASE")
        if not address:
            raise ValueError("the environment variable OPENAI_API_BASE must be set.")
        key = os.environ.get("OPENAI_API_KEY")
        if not key:
            raise ValueError("the environment variable OPENAI_API_KEY must be set.")
        headers = {"Authorization": f"Bearer {key}"}
        if not address.endswith("/"):
            address = address + "/"
        address += "chat/completions"
        try:
            with requests.post(
                address,
                json=body,
                stream=True,
                timeout=180,
                headers=headers,
            ) as response:
                if response.status_code != 200:
                    error_msg = response.text
                    error_response_code = response.status_code
                    response.raise_for_status()
                for chunk in response.iter_lines(chunk_size=None):
                    chunk = chunk.strip()

                    if not chunk:
                        continue
                    # Server-sent-event framing: lines starting with ":" are
                    # comments (e.g. keep-alives) and carry no payload.
                    if chunk.startswith(b":"):
                        continue
                    # Strip the "data:" field name only when it is actually
                    # present, instead of blindly dropping six bytes.
                    if chunk.startswith(b"data:"):
                        chunk = chunk[len(b"data:"):].strip()
                    if chunk == b"[DONE]":
                        continue
                    # NOTE(review): every data chunk is counted, including
                    # ones without content; kept as-is so reported token
                    # counts do not change.
                    tokens_received += 1
                    data = json.loads(chunk)

                    if "error" in data:
                        error_msg = data["error"]["message"]
                        error_response_code = data["error"]["code"]
                        raise RuntimeError(data["error"]["message"])

                    # Some chunks carry no choices; there is nothing to time.
                    choices = data.get("choices")
                    if not choices:
                        continue

                    delta = choices[0]["delta"]
                    if delta.get("content", None):
                        if not ttft:
                            # First chunk with content: time to first token.
                            ttft = time.monotonic() - start_time
                            time_to_next_token.append(ttft)
                        else:
                            time_to_next_token.append(
                                time.monotonic() - most_recent_received_token_time
                            )
                        most_recent_received_token_time = time.monotonic()
                        generated_text += delta["content"]

            total_request_time = time.monotonic() - start_time
            output_throughput = tokens_received / total_request_time

        except Exception as e:
            # Timeouts, connection errors and malformed chunks never set
            # error_msg above, so fall back to the exception text rather
            # than recording an empty message.
            metrics[common_metrics.ERROR_MSG] = error_msg or str(e)
            metrics[common_metrics.ERROR_CODE] = error_response_code
            print(f"Warning Or Error: {e}")
            print(error_response_code)

        metrics[common_metrics.INTER_TOKEN_LAT] = sum(time_to_next_token) #This should be same as metrics[common_metrics.E2E_LAT]. Leave it here for now
        metrics[common_metrics.TTFT] = ttft
        metrics[common_metrics.E2E_LAT] = total_request_time
        metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = output_throughput
        metrics[common_metrics.NUM_TOTAL_TOKENS] = tokens_received + prompt_len
        metrics[common_metrics.NUM_OUTPUT_TOKENS] = tokens_received
        metrics[common_metrics.NUM_INPUT_TOKENS] = prompt_len

        return metrics, generated_text, request_config
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
llmperf/src/llmperf/ray_clients/sagemaker_client.py
DELETED
@@ -1,158 +0,0 @@
|
|
1 |
-
import io
|
2 |
-
import json
|
3 |
-
import os
|
4 |
-
import time
|
5 |
-
from typing import Any, Dict
|
6 |
-
|
7 |
-
import boto3
|
8 |
-
import ray
|
9 |
-
from transformers import LlamaTokenizerFast
|
10 |
-
|
11 |
-
from llmperf.ray_llm_client import LLMClient
|
12 |
-
from llmperf.models import RequestConfig
|
13 |
-
from llmperf import common_metrics
|
14 |
-
|
15 |
-
|
16 |
-
@ray.remote
class SageMakerClient(LLMClient):
    """Client for models served from a SageMaker streaming endpoint."""

    def __init__(self):
        # Sagemaker doesn't return the number of tokens that are generated so we approximate it by
        # using the llama tokenizer.
        self.tokenizer = LlamaTokenizerFast.from_pretrained(
            "hf-internal-testing/llama-tokenizer"
        )

    def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]:
        """Invoke the endpoint named by ``request_config.model`` and time it.

        Args:
            request_config: The request to send. ``prompt`` is unpacked as a
                ``(text, length)`` pair. A ``max_tokens`` sampling parameter
                is sent as ``max_new_tokens``; the caller's dict is left
                untouched.

        Returns:
            A ``(metrics, generated_text, request_config)`` tuple (the
            signature's ``Dict`` annotation is kept unchanged for
            compatibility). ``metrics`` is keyed by the ``common_metrics``
            constants; any failure during the call is reported with error
            code 500 and the exception text.

        Raises:
            ValueError: If ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY`` or
                ``AWS_REGION_NAME`` is unset.
        """
        if not os.environ.get("AWS_ACCESS_KEY_ID"):
            raise ValueError("AWS_ACCESS_KEY_ID must be set.")
        if not os.environ.get("AWS_SECRET_ACCESS_KEY"):
            raise ValueError("AWS_SECRET_ACCESS_KEY must be set.")
        if not os.environ.get("AWS_REGION_NAME"):
            raise ValueError("AWS_REGION_NAME must be set.")

        prompt, prompt_len = request_config.prompt

        model = request_config.model
        sm_runtime = boto3.client(
            "sagemaker-runtime", region_name=os.environ.get("AWS_REGION_NAME")
        )

        # Work on a copy: sampling_params is optional (may be None) and the
        # key rename below must not leak into the caller's configuration.
        sampling_params = dict(request_config.sampling_params or {})
        if "max_tokens" in sampling_params:
            sampling_params["max_new_tokens"] = sampling_params.pop("max_tokens")

        message = {
            "inputs": [
                [
                    {"role": "system", "content": ""},
                    {"role": "user", "content": prompt},
                ]
            ],
            "parameters": sampling_params,
        }

        time_to_next_token = []
        tokens_received = 0
        ttft = 0
        error_response_code = None
        generated_text = ""
        error_msg = ""
        output_throughput = 0
        total_request_time = 0
        metrics = {}

        start_time = time.monotonic()
        most_recent_received_token_time = time.monotonic()

        try:
            response = sm_runtime.invoke_endpoint_with_response_stream(
                EndpointName=model,
                ContentType="application/json",
                Body=json.dumps(message),
                CustomAttributes="accept_eula=true",
            )

            event_stream = response["Body"]
            json_byte = b""
            for line, first_line_time, _ in LineIterator(event_stream):
                json_byte += line
                time_to_next_token.append(
                    time.monotonic() - most_recent_received_token_time
                )
                most_recent_received_token_time = time.monotonic()
                # Convert the iterator's absolute timestamp into an elapsed
                # time right away, and only once it has actually been set, so
                # a failure mid-stream or an empty stream can never leave a
                # raw clock reading or a negative value in ttft.
                if first_line_time:
                    ttft = first_line_time - start_time
            resp = json.loads(json_byte)
            total_request_time = time.monotonic() - start_time
            generated_text = resp[0]["generation"]["content"]
            # Approximate the output length with the llama tokenizer.
            tokens_received = len(self.tokenizer.encode(generated_text))
            output_throughput = tokens_received / total_request_time

        except Exception as e:
            error_msg = str(e)
            error_response_code = 500
            print(f"Warning Or Error: {e}")
            print(error_response_code)

        metrics[common_metrics.ERROR_MSG] = error_msg
        metrics[common_metrics.ERROR_CODE] = error_response_code
        # NOTE(review): this stores the per-line list, whereas the LiteLLM and
        # OpenAI clients store the sum; left unchanged pending confirmation
        # of what the consumer expects.
        metrics[common_metrics.INTER_TOKEN_LAT] = time_to_next_token
        metrics[common_metrics.TTFT] = ttft
        metrics[common_metrics.E2E_LAT] = total_request_time
        metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = output_throughput
        metrics[common_metrics.NUM_TOTAL_TOKENS] = tokens_received + prompt_len
        metrics[common_metrics.NUM_OUTPUT_TOKENS] = tokens_received
        metrics[common_metrics.NUM_INPUT_TOKENS] = prompt_len

        return metrics, generated_text, request_config
|
118 |
-
|
119 |
-
|
120 |
-
class LineIterator:
    """
    A helper class for parsing the byte stream input.

    Wraps an iterable of event dicts. The bytes found under
    ``event["PayloadPart"]["Bytes"]`` are appended to an internal buffer and
    handed back one line at a time, so a line split across several events is
    returned whole. Each item is a ``(line, ttft, now)`` tuple where ``line``
    has its trailing newline removed, ``ttft`` is the ``time.monotonic()``
    reading taken when the first line was produced (0 until then) and ``now``
    is the reading taken when this item was produced.

    Reference: https://aws.amazon.com/blogs/machine-learning/elevating-the-generative-ai-experience-introducing-streaming-support-in-amazon-sagemaker-hosting/
    """

    def __init__(self, stream):
        self.byte_iterator = iter(stream)
        self.buffer = io.BytesIO()
        # Offset of the first byte in the buffer not yet returned.
        self.read_pos = 0
        self.ttft = 0

    def __iter__(self):
        return self

    def __next__(self):
        while True:
            self.buffer.seek(self.read_pos)
            line = self.buffer.readline()
            if line and line[-1] == ord("\n"):
                if self.ttft == 0:
                    self.ttft = time.monotonic()
                self.read_pos += len(line)
                return line[:-1], self.ttft, time.monotonic()
            # kyle: dealing with last ']' for chat output
            if line and self.read_pos == self.buffer.getbuffer().nbytes - 1:
                self.read_pos += 1
                return line, self.ttft, time.monotonic()
            try:
                chunk = next(self.byte_iterator)
            except StopIteration:
                if line:
                    # The stream ended with an unterminated fragment. Return
                    # it as the final line; retrying here would spin forever
                    # because no more bytes can arrive.
                    if self.ttft == 0:
                        self.ttft = time.monotonic()
                    self.read_pos += len(line)
                    return line, self.ttft, time.monotonic()
                raise
            if "PayloadPart" not in chunk:
                # chunk is a dict, so format it rather than concatenating it
                # to a str (which raises TypeError).
                print(f"Unknown event type:{chunk}")
                continue
            self.buffer.seek(0, io.SEEK_END)
            self.buffer.write(chunk["PayloadPart"]["Bytes"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
llmperf/src/llmperf/ray_clients/vertexai_client.py
DELETED
@@ -1,135 +0,0 @@
|
|
1 |
-
import json
|
2 |
-
import os
|
3 |
-
import time
|
4 |
-
from typing import Any, Dict
|
5 |
-
|
6 |
-
import ray
|
7 |
-
import requests
|
8 |
-
from transformers import LlamaTokenizerFast
|
9 |
-
|
10 |
-
from llmperf.ray_llm_client import LLMClient
|
11 |
-
from llmperf.models import RequestConfig
|
12 |
-
from llmperf import common_metrics
|
13 |
-
|
14 |
-
|
15 |
-
@ray.remote
class VertexAIClient(LLMClient):
    """Client for VertexAI API."""

    def __init__(self):
        # VertexAI doesn't return the number of tokens that are generated so we approximate it by
        # using the llama tokenizer.
        self.tokenizer = LlamaTokenizerFast.from_pretrained(
            "hf-internal-testing/llama-tokenizer"
        )

    def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]:
        """Send one (non-streaming) predict request and time it.

        Args:
            request_config: The request to send. ``prompt`` is unpacked as a
                ``(text, length)`` pair. A ``max_new_tokens`` sampling
                parameter is sent as ``maxOutputTokens``; the caller's dict
                is left untouched.

        Returns:
            A ``(metrics, generated_text, request_config)`` tuple (the
            signature's ``Dict`` annotation is kept unchanged for
            compatibility). ``metrics`` is keyed by the ``common_metrics``
            constants. The response is not streamed, so TTFT is reported as
            ``-1`` on success and the per-token latency is the total time
            divided evenly over the tokens.

        Raises:
            ValueError: If ``GCLOUD_PROJECT_ID``, ``GCLOUD_REGION``,
                ``VERTEXAI_ENDPOINT_ID`` or ``GCLOUD_ACCESS_TOKEN`` is unset.
        """
        project_id = os.environ.get("GCLOUD_PROJECT_ID")
        region = os.environ.get("GCLOUD_REGION")
        endpoint_id = os.environ.get("VERTEXAI_ENDPOINT_ID")
        # Tolerate a missing variable here so that the explicit check below
        # raises the intended ValueError instead of an AttributeError.
        access_token = (os.environ.get("GCLOUD_ACCESS_TOKEN") or "").strip()
        if not project_id:
            raise ValueError("the environment variable GCLOUD_PROJECT_ID must be set.")
        if not region:
            raise ValueError("the environment variable GCLOUD_REGION must be set.")
        if not endpoint_id:
            raise ValueError(
                "the environment variable VERTEXAI_ENDPOINT_ID must be set."
            )
        if not access_token:
            raise ValueError(
                "the environment variable GCLOUD_ACCESS_TOKEN must be set."
            )
        prompt, prompt_len = request_config.prompt

        time_to_next_token = []
        tokens_received = 0
        ttft = 0
        generated_text = ""
        output_throughput = 0
        total_request_time = 0
        # Defined up front so the except block can report it even when the
        # failure happens before a response exists.
        response_code = None

        metrics = {}

        metrics[common_metrics.ERROR_CODE] = None
        metrics[common_metrics.ERROR_MSG] = ""

        try:
            # Define the URL for the request
            url = (
                f"https://{region}-aiplatform.googleapis.com/v1/projects/"
                f"{project_id}/locations/{region}/endpoints/{endpoint_id}:predict"
            )

            # Define the headers
            headers = {
                "Authorization": f"Bearer {access_token}",
                "Content-Type": "application/json",
            }

            # Work on a copy: sampling_params is optional (may be None) and
            # the key rename must not leak into the caller's configuration.
            sampling_params = dict(request_config.sampling_params or {})
            if "max_new_tokens" in sampling_params:
                sampling_params["maxOutputTokens"] = sampling_params.pop(
                    "max_new_tokens"
                )

            # Define the data payload
            data = {"instances": [{"prompt": prompt}], "parameters": sampling_params}

            # Make the POST request
            # NOTE(review): no timeout is set, so a stalled endpoint blocks
            # this call indefinitely; pick a limit once typical generation
            # times are known.
            start_time = time.monotonic()
            response = requests.post(url, headers=headers, data=json.dumps(data))
            total_request_time = time.monotonic() - start_time
            response_code = response.status_code
            response.raise_for_status()
            # output from the endpoint is in the form:
            # {"predictions": ["Input: ... \nOutput:\n ..."]}
            generated_text = response.json()["predictions"][0].split("\nOutput:\n")[1]
            tokens_received = len(self.tokenizer.encode(generated_text))
            ttft = -1
            output_throughput = tokens_received / total_request_time
            time_to_next_token = [
                total_request_time / tokens_received for _ in range(tokens_received)
            ]

        except Exception as e:
            metrics[common_metrics.ERROR_MSG] = str(e)
            metrics[common_metrics.ERROR_CODE] = response_code
            print(f"Warning Or Error: {e}")
            print(response_code)

        # NOTE(review): this stores the per-token list, whereas the LiteLLM
        # and OpenAI clients store the sum; left unchanged pending
        # confirmation of what the consumer expects.
        metrics[common_metrics.INTER_TOKEN_LAT] = time_to_next_token
        metrics[common_metrics.TTFT] = ttft
        metrics[common_metrics.E2E_LAT] = total_request_time
        metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = output_throughput
        metrics[common_metrics.NUM_TOTAL_TOKENS] = tokens_received + prompt_len
        metrics[common_metrics.NUM_OUTPUT_TOKENS] = tokens_received
        metrics[common_metrics.NUM_INPUT_TOKENS] = prompt_len

        return metrics, generated_text, request_config
|
112 |
-
|
113 |
-
|
114 |
-
if __name__ == "__main__":
    # Manual smoke test: sends a single request through a VertexAIClient.
    #
    # Run these before hand:

    # gcloud auth application-default login
    # gcloud config set project YOUR_PROJECT_ID
    # export GCLOUD_ACCESS_TOKEN=$(gcloud auth print-access-token)
    # export GCLOUD_PROJECT_ID=YOUR_PROJECT_ID
    # export GCLOUD_REGION=YOUR_REGION
    # export VERTEXAI_ENDPOINT_ID=YOUR_ENDPOINT_ID

    client = VertexAIClient.remote()

    # (prompt text, prompt length) pair expected by RequestConfig.
    demo_prompt = (
        "Give me ten interview questions for the role of program manager.",
        10,
    )
    demo_sampling_params = {
        "temperature": 0.2,
        "max_new_tokens": 256,
        "top_k": 40,
        "top_p": 0.95,
    }
    request_config = RequestConfig(
        prompt=demo_prompt,
        model="gpt3",
        sampling_params=demo_sampling_params,
    )

    # Block until the remote call has finished.
    ray.get(client.llm_request.remote(request_config))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
llmperf/src/llmperf/ray_llm_client.py
DELETED
@@ -1,22 +0,0 @@
|
|
1 |
-
import abc
|
2 |
-
from typing import Any, Dict, Tuple
|
3 |
-
|
4 |
-
from llmperf.models import RequestConfig
|
5 |
-
|
6 |
-
|
7 |
-
class LLMClient:
    """Base interface for clients that send requests to an LLM API.

    Anyscale Endpoints is one example of such an API. Concrete clients
    override ``llm_request``. Note that this class does not derive from
    ``abc.ABC``, so the ``abstractmethod`` marker documents intent only.
    """

    @abc.abstractmethod
    def llm_request(
        self, request_config: RequestConfig
    ) -> Tuple[Dict[str, Any], str, RequestConfig]:
        """Make a single completion request to a LLM API.

        Args:
            request_config: The configuration of the request to send.

        Returns:
            A three-element tuple:
              * metrics about the performance characteristics of the request,
              * the text generated by the request to the LLM API,
              * the request_config used to make the request (mainly for
                logging purposes).

        """
        ...
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
llmperf/src/llmperf/requests_launcher.py
DELETED
@@ -1,48 +0,0 @@
|
|
1 |
-
from typing import Any, List
|
2 |
-
|
3 |
-
from llmperf.ray_llm_client import LLMClient
|
4 |
-
from llmperf.models import RequestConfig
|
5 |
-
from ray.util import ActorPool
|
6 |
-
|
7 |
-
|
8 |
-
class RequestsLauncher:
    """Fan requests out over a pool of LLMClients and collect their results."""

    def __init__(self, llm_clients: List[LLMClient]):
        # All clients are managed through a single ActorPool.
        self._llm_client_pool = ActorPool(llm_clients)

    def launch_requests(self, request_config: RequestConfig) -> None:
        """Submit one request to the pool.

        The request is only submitted when the pool reports a free client;
        otherwise this call does nothing.

        Args:
            request_config: The configuration for the request.

        """
        pool = self._llm_client_pool
        if not pool.has_free():
            return

        def _send(client, _request_config):
            # Runs the request on the client picked by the pool.
            return client.llm_request.remote(_request_config)

        pool.submit(_send, request_config)

    def get_next_ready(self, block: bool = False) -> List[Any]:
        """Collect results from the pool.

        Args:
            block: When true, first spin until the pool reports that it has
                a next result.

        Returns:
            Every result fetched from the pool while it reports having one,
            in the order they were fetched.

        """
        pool = self._llm_client_pool

        if block:
            # Busy-wait until there is something to fetch.
            while not pool.has_next():
                pass

        ready = []
        while pool.has_next():
            ready.append(pool.get_next_unordered())
        return ready
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
llmperf/src/llmperf/sonnet.txt
DELETED
@@ -1,84 +0,0 @@
|
|
1 |
-
Shall I compare thee to a summer's day?
|
2 |
-
Thou art more lovely and more temperate:
|
3 |
-
Rough winds do shake the darling buds of May,
|
4 |
-
And summer's lease hath all too short a date:
|
5 |
-
Sometime too hot the eye of heaven shines,
|
6 |
-
And often is his gold complexion dimm'd;
|
7 |
-
And every fair from fair sometime declines,
|
8 |
-
By chance or nature's changing course untrimm'd;
|
9 |
-
But thy eternal summer shall not fade
|
10 |
-
Nor lose possession of that fair thou owest;
|
11 |
-
Nor shall Death brag thou wander'st in his shade,
|
12 |
-
When in eternal lines to time thou growest:
|
13 |
-
So long as men can breathe or eyes can see,
|
14 |
-
So long lives this and this gives life to thee.
|
15 |
-
Then let not winter's ragged hand deface
|
16 |
-
In thee thy summer, ere thou be distill'd:
|
17 |
-
Make sweet some vial; treasure thou some place
|
18 |
-
With beauty's treasure, ere it be self-kill'd.
|
19 |
-
That use is not forbidden usury,
|
20 |
-
Which happies those that pay the willing loan;
|
21 |
-
That's for thyself to breed another thee,
|
22 |
-
Or ten times happier, be it ten for one;
|
23 |
-
Ten times thyself were happier than thou art,
|
24 |
-
If ten of thine ten times refigured thee:
|
25 |
-
Then what could death do, if thou shouldst depart,
|
26 |
-
Leaving thee living in posterity?
|
27 |
-
Be not self-will'd, for thou art much too fair
|
28 |
-
To be death's conquest and make worms thine heir.
|
29 |
-
Where art thou, Muse, that thou forget'st so long
|
30 |
-
To speak of that which gives thee all thy might?
|
31 |
-
Spend'st thou thy fury on some worthless song,
|
32 |
-
Darkening thy power to lend base subjects light?
|
33 |
-
Return, forgetful Muse, and straight redeem
|
34 |
-
In gentle numbers time so idly spent;
|
35 |
-
Sing to the ear that doth thy lays esteem
|
36 |
-
And gives thy pen both skill and argument.
|
37 |
-
Rise, resty Muse, my love's sweet face survey,
|
38 |
-
If Time have any wrinkle graven there;
|
39 |
-
If any, be a satire to decay,
|
40 |
-
And make Time's spoils despised every where.
|
41 |
-
Give my love fame faster than Time wastes life;
|
42 |
-
So thou prevent'st his scythe and crooked knife.
|
43 |
-
My glass shall not persuade me I am old,
|
44 |
-
So long as youth and thou are of one date;
|
45 |
-
But when in thee time's furrows I behold,
|
46 |
-
Then look I death my days should expiate.
|
47 |
-
For all that beauty that doth cover thee
|
48 |
-
Is but the seemly raiment of my heart,
|
49 |
-
Which in thy breast doth live, as thine in me:
|
50 |
-
How can I then be elder than thou art?
|
51 |
-
O, therefore, love, be of thyself so wary
|
52 |
-
As I, not for myself, but for thee will;
|
53 |
-
Bearing thy heart, which I will keep so chary
|
54 |
-
As tender nurse her babe from faring ill.
|
55 |
-
Presume not on thy heart when mine is slain;
|
56 |
-
Thou gavest me thine, not to give back again.
|
57 |
-
So am I as the rich, whose blessed key
|
58 |
-
Can bring him to his sweet up-locked treasure,
|
59 |
-
The which he will not every hour survey,
|
60 |
-
For blunting the fine point of seldom pleasure.
|
61 |
-
Therefore are feasts so solemn and so rare,
|
62 |
-
Since, seldom coming, in the long year set,
|
63 |
-
Like stones of worth they thinly placed are,
|
64 |
-
Or captain jewels in the carcanet.
|
65 |
-
So is the time that keeps you as my chest,
|
66 |
-
Or as the wardrobe which the robe doth hide,
|
67 |
-
To make some special instant special blest,
|
68 |
-
By new unfolding his imprison'd pride.
|
69 |
-
Blessed are you, whose worthiness gives scope,
|
70 |
-
Being had, to triumph, being lack'd, to hope.
|
71 |
-
If there be nothing new, but that which is
|
72 |
-
Hath been before, how are our brains beguiled,
|
73 |
-
Which, labouring for invention, bear amiss
|
74 |
-
The second burden of a former child!
|
75 |
-
O, that record could with a backward look,
|
76 |
-
Even of five hundred courses of the sun,
|
77 |
-
Show me your image in some antique book,
|
78 |
-
Since mind at first in character was done!
|
79 |
-
That I might see what the old world could say
|
80 |
-
To this composed wonder of your frame;
|
81 |
-
Whether we are mended, or whether better they,
|
82 |
-
Or whether revolution be the same.
|
83 |
-
O, sure I am, the wits of former days
|
84 |
-
To subjects worse have given admiring praise.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
llmperf/src/llmperf/utils.py
DELETED
@@ -1,147 +0,0 @@
|
|
1 |
-
import json
|
2 |
-
import math
|
3 |
-
import pathlib
|
4 |
-
import random
|
5 |
-
import subprocess
|
6 |
-
import time
|
7 |
-
from typing import Any, Dict, Tuple
|
8 |
-
|
9 |
-
from transformers import LlamaTokenizerFast
|
10 |
-
|
11 |
-
|
12 |
-
RESULTS_VERSION = "2023-08-31"
|
13 |
-
|
14 |
-
|
15 |
-
class LLMPerfResults:
    """Named bundle of benchmark metadata stamped with a time and version."""

    def __init__(
        self,
        name: str,
        metadata: Dict[str, Any] = None,
    ):
        """Store the name and metadata and stamp the creation time.

        Args:
            name: The name of this result set.
            metadata: Optional metadata to attach. It is copied, so the
                caller's dictionary is never modified.

        """
        self.name = name
        # Copy instead of aliasing: the timestamp written below used to leak
        # into the caller's dict (but only when that dict was non-empty).
        self.metadata = dict(metadata) if metadata else {}
        self.timestamp = int(time.time())
        self.metadata["timestamp"] = self.timestamp
        self.version = RESULTS_VERSION

    def to_dict(self):
        """Return version, name and metadata as one flattened dictionary."""
        data = {
            "version": self.version,
            "name": self.name,
        }
        # Metadata keys win over "version"/"name" if they collide.
        data.update(self.metadata)
        return flatten_dict(data)

    def json(self):
        """Return the flattened results serialized as a JSON string."""
        return json.dumps(self.to_dict())
|
39 |
-
|
40 |
-
|
41 |
-
def upload_to_s3(results_path: str, s3_path: str) -> None:
    """Upload the results to s3.

    Runs ``aws s3 sync`` as a subprocess and prints whether it succeeded.

    Args:
        results_path: The path to the results file.
        s3_path: The s3 path to upload the results to.

    """
    command = ["aws", "s3", "sync", results_path, f"{s3_path}/"]
    # stderr must be piped for ``result.stderr`` to hold anything; without it
    # the attribute is always None and the error branch printed "None".
    result = subprocess.run(command, stderr=subprocess.PIPE, text=True)
    if result.returncode == 0:
        print("Files uploaded successfully!")
    else:
        print("An error occurred:")
        print(result.stderr)
|
57 |
-
|
58 |
-
|
59 |
-
def randomly_sample_sonnet_lines_prompt(
    prompt_tokens_mean: int = 550,
    prompt_tokens_stddev: int = 250,
    expect_output_tokens: int = 150,
) -> Tuple[str, int]:
    """Generate a prompt that randomly samples lines from the sonnets in sonnet.txt.

    Args:
        prompt_tokens_mean: The mean length (in tokens) of the prompt to generate.
        prompt_tokens_stddev: The standard deviation of the length (in tokens)
            of the prompt to generate.
        expect_output_tokens: The number of output tokens requested in the
            instruction text that starts the prompt.

    Note:
        Tokens are counted using the Llama tokenizer. Using one tokenizer
        ensures a fairer comparison across different LLMs. For example, if gpt 3.5 tokenizes
        a prompt in less tokens than Llama2, then this will be reflected in the results since
        they will be fed identical prompts.

    Returns:
        A tuple of the prompt and the sampled target prompt length in tokens.
        The final line is cut by characters rather than tokens, so the actual
        token count of the prompt can differ from the returned target.

    Raises:
        ValueError: If sonnet.txt contains no lines (previously this looped forever).
    """
    tokenizer = LlamaTokenizerFast.from_pretrained(
        "hf-internal-testing/llama-tokenizer"
    )

    def get_token_length(text):
        return len(tokenizer.encode(text))

    prompt = (
        "Randomly stream lines from the following text "
        f"with {expect_output_tokens} output tokens. "
        "Don't generate eos tokens:\n\n"
    )
    # The instruction header never changes, so tokenize it only once.
    base_prompt_tokens = get_token_length(prompt)

    # get a prompt length that is at least as long as the base
    num_prompt_tokens = sample_random_positive_int(
        prompt_tokens_mean, prompt_tokens_stddev
    )
    while num_prompt_tokens < base_prompt_tokens:
        num_prompt_tokens = sample_random_positive_int(
            prompt_tokens_mean, prompt_tokens_stddev
        )
    remaining_prompt_tokens = num_prompt_tokens - base_prompt_tokens

    sonnet_path = pathlib.Path(__file__).parent.resolve() / "sonnet.txt"
    with open(sonnet_path, "r") as f:
        sonnet_lines = f.readlines()
    if not sonnet_lines:
        raise ValueError(f"{sonnet_path} contains no lines to sample from")
    random.shuffle(sonnet_lines)

    # Cycle through the shuffled lines (repeating them if necessary) until
    # the token budget is used up.
    sampling_lines = True
    while sampling_lines:
        for line in sonnet_lines:
            line_tokens = get_token_length(line)
            if remaining_prompt_tokens - line_tokens < 0:
                # This will cut off a line in the middle of a word, but that's ok since an
                # llm should be able to handle that. The remaining *token*
                # budget is used as a *character* count here.
                prompt += line[:remaining_prompt_tokens]
                sampling_lines = False
                break
            prompt += line
            remaining_prompt_tokens -= line_tokens
    return (prompt, num_prompt_tokens)
|
121 |
-
|
122 |
-
|
123 |
-
def sample_random_positive_int(mean: int, stddev: int) -> int:
    """Draw from a gaussian distribution until the truncated draw is positive.

    Args:
        mean: The mean of the gaussian distribution to sample from.
        stddev: The standard deviation of the gaussian distribution to sample from.

    Returns:
        The first draw whose integer truncation is strictly greater than zero.
    """
    while True:
        candidate = int(random.gauss(mean, stddev))
        if candidate > 0:
            return candidate
|
137 |
-
|
138 |
-
|
139 |
-
def flatten_dict(d, parent_key="", sep="_"):
    """Collapse a nested dictionary into a single level.

    Keys of nested dictionaries are joined to their parent key with ``sep``.
    A top-level key (or any key under an empty/falsy parent key) is kept
    unchanged.

    Args:
        d: The dictionary to flatten.
        parent_key: Prefix applied to every key of ``d``.
        sep: Separator placed between a parent key and a child key.

    Returns:
        A new flat dictionary; later duplicates overwrite earlier ones.
    """
    flat = {}
    for key, value in d.items():
        full_key = f"{parent_key}{sep}{key}" if parent_key else key
        if isinstance(value, dict):
            flat.update(flatten_dict(value, full_key, sep=sep))
        else:
            flat[full_key] = value
    return flat
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
llmperf/token_benchmark_ray.py
DELETED
@@ -1,469 +0,0 @@
|
|
1 |
-
import argparse
|
2 |
-
from collections.abc import Iterable
|
3 |
-
import json
|
4 |
-
import os
|
5 |
-
from pathlib import Path
|
6 |
-
import re
|
7 |
-
import time
|
8 |
-
import random
|
9 |
-
from typing import Any, Dict, List, Optional, Tuple
|
10 |
-
|
11 |
-
import pandas as pd
|
12 |
-
import ray
|
13 |
-
|
14 |
-
from llmperf import common_metrics
|
15 |
-
from llmperf.common import SUPPORTED_APIS, construct_clients
|
16 |
-
|
17 |
-
from llmperf.models import RequestConfig
|
18 |
-
from llmperf.requests_launcher import RequestsLauncher
|
19 |
-
from llmperf.utils import (
|
20 |
-
randomly_sample_sonnet_lines_prompt,
|
21 |
-
LLMPerfResults,
|
22 |
-
sample_random_positive_int,
|
23 |
-
)
|
24 |
-
from tqdm import tqdm
|
25 |
-
|
26 |
-
from transformers import LlamaTokenizerFast
|
27 |
-
|
28 |
-
def get_token_throughput_latencies(
    model: str,
    mean_input_tokens: int,
    stddev_input_tokens: int,
    mean_output_tokens: int,
    stddev_output_tokens: int,
    additional_sampling_params: Optional[Dict[str, Any]] = None,
    num_concurrent_requests: int = 1,
    max_num_completed_requests: int = 500,
    test_timeout_s=90,
    llm_api="openai",
) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
    """Get the token throughput and latencies for the given model.

    Args:
        model: The name of the model to query.
        mean_input_tokens: The mean number of tokens to send in the prompt for the request.
        stddev_input_tokens: The standard deviation of the number of tokens to send in the prompt for the request.
        mean_output_tokens: The mean number of tokens to generate per request.
        stddev_output_tokens: The standard deviation of the number of tokens to generate per request.
        additional_sampling_params: Additional sampling parameters to send with the request.
            For more information see the LLM APIs documentation for the completions
        num_concurrent_requests: The number of concurrent requests to make. Increase
            this to increase the amount of load and vice versa.
        max_num_completed_requests: Stop launching requests once this many
            requests have been collected.
        test_timeout_s: The amount of time to run the test for before reporting results.
        llm_api: The name of the llm api to use. Either "openai" or "litellm".

    Returns:
        A summary of the performance metrics collected across all completed requests
        (e.g. throughput, latencies, etc.)
        The individual metrics for each request.
    """
    # Fixed seed so the sampled prompt/output lengths are reproducible.
    random.seed(11111)

    tokenizer = LlamaTokenizerFast.from_pretrained(
        "hf-internal-testing/llama-tokenizer"
    )

    def get_token_length(text):
        return len(tokenizer.encode(text))

    if not additional_sampling_params:
        additional_sampling_params = {}

    clients = construct_clients(llm_api=llm_api, num_clients=num_concurrent_requests)
    req_launcher = RequestsLauncher(clients)

    def collect_ready_metrics():
        """Drain finished requests and fill in their token-derived metrics."""
        ready = []
        for out in req_launcher.get_next_ready():
            request_metrics, gen_text, _ = out
            generated_tokens = get_token_length(gen_text)
            # Turn the accumulated inter-token latency into a per-token value.
            if generated_tokens:
                request_metrics[common_metrics.INTER_TOKEN_LAT] /= generated_tokens
            else:
                request_metrics[common_metrics.INTER_TOKEN_LAT] = 0
            request_metrics[common_metrics.NUM_OUTPUT_TOKENS] = generated_tokens
            request_metrics[common_metrics.NUM_TOTAL_TOKENS] = (
                request_metrics[common_metrics.NUM_INPUT_TOKENS] + generated_tokens
            )
            request_metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = (
                generated_tokens / request_metrics[common_metrics.E2E_LAT]
            )
            ready.append(request_metrics)
        return ready

    completed_requests = []
    num_completed_requests = 0
    start_time = time.monotonic()
    num_launch_attempts = 0
    pbar = tqdm(total=max_num_completed_requests)
    try:
        while (
            time.monotonic() - start_time < test_timeout_s
            and len(completed_requests) < max_num_completed_requests
        ):
            num_launch_attempts += 1
            num_output_tokens = sample_random_positive_int(
                mean_output_tokens, stddev_output_tokens
            )

            prompt = randomly_sample_sonnet_lines_prompt(
                prompt_tokens_mean=mean_input_tokens,
                prompt_tokens_stddev=stddev_input_tokens,
                expect_output_tokens=num_output_tokens,
            )

            default_sampling_params = {"max_tokens": num_output_tokens}
            default_sampling_params.update(additional_sampling_params)
            request_config = RequestConfig(
                model=model,
                prompt=prompt,
                sampling_params=default_sampling_params,
                llm_api=llm_api,
            )
            req_launcher.launch_requests(request_config)
            # Retrieving results less frequently allows for more concurrent requests
            # to be launched. This will overall reduce the amount of time it takes
            # for the test to run.
            if not (num_launch_attempts % num_concurrent_requests):
                completed_requests.extend(collect_ready_metrics())
            pbar.update(len(completed_requests) - num_completed_requests)
            num_completed_requests = len(completed_requests)
    finally:
        # Always release the progress bar, even if a request raised.
        pbar.close()

    end_time = time.monotonic()
    if end_time - start_time >= test_timeout_s:
        print("Test timed out before all requests could be completed.")

    # check one last time that there are no remaining results to collect.
    completed_requests.extend(collect_ready_metrics())

    # The original literal began with the invalid escape "\R"; a leading
    # newline was clearly intended.
    print(f"\nResults for token benchmark for {model} queried with the {llm_api} api.\n")
    ret = metrics_summary(completed_requests, start_time, end_time)

    metadata = {
        "model": model,
        "mean_input_tokens": mean_input_tokens,
        "stddev_input_tokens": stddev_input_tokens,
        "mean_output_tokens": mean_output_tokens,
        "stddev_output_tokens": stddev_output_tokens,
        "num_concurrent_requests": num_concurrent_requests,
        "additional_sampling_params": additional_sampling_params,
    }

    metadata["results"] = ret

    return metadata, completed_requests
|
160 |
-
|
161 |
-
|
162 |
-
def metrics_summary(
    metrics: List[Dict[str, Any]], start_time: int, end_time: int
) -> Dict[str, Any]:
    """Summarize per-request metrics, printing each figure as it is computed.

    Args:
        metrics: The metrics to summarize.
        start_time: The time the test started.
        end_time: The time the test ended.

    Returns:
        A summary with the following information:
            - Overall throughput (generated tokens / total test time)
            - Number of completed requests
            - Error rate
            - Error code frequency
            - Quantiles (p25-p99), mean, min, max and stddev for:
                - Inter token latency
                - Time to first token
                - User total request time
                - Number of tokens processed per request
                - Number of tokens generated per request
                - User throughput (tokens / s)
    """
    summary = {}

    def flatten(item):
        # Recursively unpack nested iterables; strings count as scalars.
        for sub_item in item:
            if isinstance(sub_item, Iterable) and not isinstance(sub_item, str):
                yield from flatten(sub_item)
            else:
                yield sub_item

    df = pd.DataFrame(metrics)
    error_column = df[common_metrics.ERROR_CODE]
    # Only requests without an error code feed the latency/throughput stats.
    df_without_errored_req = df[error_column.isna()]
    elapsed = end_time - start_time

    summarized_keys = [
        common_metrics.INTER_TOKEN_LAT,
        common_metrics.TTFT,
        common_metrics.E2E_LAT,
        common_metrics.REQ_OUTPUT_THROUGHPUT,
        common_metrics.NUM_INPUT_TOKENS,
        common_metrics.NUM_OUTPUT_TOKENS
    ]
    for key in summarized_keys:
        print(key)
        series = pd.Series(list(flatten(df_without_errored_req[key]))).dropna()
        quantiles = series.quantile([0.25, 0.5, 0.75, 0.9, 0.95, 0.99]).to_dict()

        quantiles_reformatted_keys = {}
        for quantile, value in quantiles.items():
            reformatted_key = f"p{int(quantile * 100)}"
            print(f" {reformatted_key} = {value}")
            quantiles_reformatted_keys[reformatted_key] = value

        key_summary = {"quantiles": quantiles_reformatted_keys}
        for label, stat in (
            ("mean", series.mean()),
            ("min", series.min()),
            ("max", series.max()),
            ("stddev", series.std()),
        ):
            print(f" {label} = {stat}")
            key_summary[label] = stat
        summary[key] = key_summary

    num_requests = len(metrics)
    summary[common_metrics.NUM_REQ_STARTED] = num_requests

    error_codes = error_column.dropna()
    num_errors = len(error_codes)
    summary[common_metrics.ERROR_RATE] = num_errors / num_requests if num_requests else 0
    summary[common_metrics.NUM_ERRORS] = num_errors
    print(f"Number Of Errored Requests: {num_errors}")
    error_code_frequency = dict(error_codes.value_counts())
    if num_errors:
        print("Error Code Frequency")
        print(error_code_frequency)
    summary[common_metrics.ERROR_CODE_FREQ] = str(error_code_frequency)

    overall_output_throughput = (
        df_without_errored_req[common_metrics.NUM_OUTPUT_TOKENS].sum() / elapsed
    )

    print(f"Overall Output Throughput: {overall_output_throughput}")
    summary[common_metrics.OUTPUT_THROUGHPUT] = overall_output_throughput

    num_completed_requests = len(df_without_errored_req)
    num_completed_requests_per_min = num_completed_requests / elapsed * 60
    print(f"Number Of Completed Requests: {num_completed_requests}")
    print(f"Completed Requests Per Minute: {num_completed_requests_per_min}")

    summary[common_metrics.NUM_COMPLETED_REQUESTS] = num_completed_requests
    summary[common_metrics.COMPLETED_REQUESTS_PER_MIN] = num_completed_requests_per_min

    return summary
|
258 |
-
|
259 |
-
|
260 |
-
def run_token_benchmark(
    llm_api: str,
    model: str,
    test_timeout_s: int,
    max_num_completed_requests: int,
    num_concurrent_requests: int,
    mean_input_tokens: int,
    stddev_input_tokens: int,
    mean_output_tokens: int,
    stddev_output_tokens: int,
    additional_sampling_params: str,
    results_dir: str,
    user_metadata: Dict[str, Any],
):
    """Run the token benchmark and optionally write its results as JSON files.

    Args:
        llm_api: The name of the llm api to use.
        model: The name of the model to query.
        max_num_completed_requests: The number of requests to complete before finishing the test.
        test_timeout_s: The amount of time to run the test for before reporting results.
        num_concurrent_requests: The number of concurrent requests to make. Increase
            this to increase the amount of load and vice versa.
        mean_input_tokens: The mean number of tokens to send in the prompt for the request.
        stddev_input_tokens: The standard deviation of the number of tokens to send in the prompt for the request.
        mean_output_tokens: The mean number of tokens to generate per request.
        stddev_output_tokens: The standard deviation of the number of tokens to generate per request.
        additional_sampling_params: Additional sampling parameters to send with the request,
            given as a JSON string.
            For more information see the LLM APIs documentation for the completions.
        results_dir: The directory to save the results to. Nothing is saved
            when this is empty.
        user_metadata: Additional metadata to include in the results.

    Raises:
        ValueError: If ``results_dir`` exists but is not a directory.
    """
    if mean_input_tokens < 40:
        print(
            "the minimum number of input tokens that will be sent is 41"
            " because of the prompting logic right now"
        )

    summary, individual_responses = get_token_throughput_latencies(
        model=model,
        llm_api=llm_api,
        test_timeout_s=test_timeout_s,
        max_num_completed_requests=max_num_completed_requests,
        mean_input_tokens=mean_input_tokens,
        stddev_input_tokens=stddev_input_tokens,
        mean_output_tokens=mean_output_tokens,
        stddev_output_tokens=stddev_output_tokens,
        num_concurrent_requests=num_concurrent_requests,
        additional_sampling_params=json.loads(additional_sampling_params),
    )

    if not results_dir:
        return

    # Build a filesystem-friendly base name from the model and token settings.
    filename = f"{model}_{mean_input_tokens}_{mean_output_tokens}"
    filename = re.sub(r"[^\w\d-]+", "-", filename)
    filename = re.sub(r"-{2,}", "-", filename)
    summary_filename = f"{filename}_summary"
    individual_responses_filename = f"{filename}_individual_responses"

    # Update to metadata.
    summary.update(user_metadata)

    results = LLMPerfResults(name=summary_filename, metadata=summary)
    results_dir = Path(results_dir)
    if results_dir.exists() and not results_dir.is_dir():
        raise ValueError(f"{results_dir} is not a directory")
    # exist_ok avoids failing if the directory appears between the check
    # above and the creation (e.g. concurrent benchmark runs).
    results_dir.mkdir(parents=True, exist_ok=True)

    try:
        with open(results_dir / f"{summary_filename}.json", "w") as f:
            json.dump(results.to_dict(), f, indent=4, default=str)
    except Exception:
        # Dump the data to stdout so a failed write does not lose the run.
        print(results.to_dict())
        raise

    try:
        with open(results_dir / f"{individual_responses_filename}.json", "w") as f:
            json.dump(individual_responses, f, indent=4)
    except Exception:
        print(individual_responses)
        raise
|
340 |
-
|
341 |
-
|
342 |
-
# Command-line interface for the token throughput/latency benchmark.
args = argparse.ArgumentParser(
    description="Run a token throughput and latency benchmark."
)

args.add_argument(
    "--model", type=str, required=True, help="The model to use for this load test."
)
args.add_argument(
    "--mean-input-tokens",
    type=int,
    default=550,
    help=(
        "The mean number of tokens to send in the prompt for the request. "
        " (default: %(default)s)"
    ),
)
args.add_argument(
    "--stddev-input-tokens",
    type=int,
    default=150,
    help=(
        "The standard deviation of number of tokens to send in the prompt for the request. "
        "(default: %(default)s)"
    ),
)
args.add_argument(
    "--mean-output-tokens",
    type=int,
    default=150,
    help=(
        "The mean number of tokens to generate from each llm request. This is the max_tokens param "
        "for the completions API. Note that this is not always the number of tokens returned. "
        "(default: %(default)s)"
    ),
)
args.add_argument(
    "--stddev-output-tokens",
    type=int,
    default=80,
    help=(
        "The stdandard deviation on the number of tokens to generate per llm request. "
        "(default: %(default)s)"
    ),
)
args.add_argument(
    "--num-concurrent-requests",
    type=int,
    default=10,
    help=("The number of concurrent requests to send (default: %(default)s)"),
)
args.add_argument(
    "--timeout",
    type=int,
    default=90,
    help="The amount of time to run the load test for. (default: %(default)s)",
)
args.add_argument(
    "--max-num-completed-requests",
    type=int,
    default=10,
    help=(
        "The number of requests to complete before finishing the test. Note "
        "that its possible for the test to timeout first. (default: %(default)s)"
    ),
)
args.add_argument(
    "--additional-sampling-params",
    type=str,
    default="{}",
    help=(
        "Additional sampling params to send with the each request to the LLM API. "
        "(default: %(default)s) No additional sampling params are sent."
    ),
)
args.add_argument(
    "--results-dir",
    type=str,
    default="",
    help=(
        "The directory to save the results to. "
        "(`default: %(default)s`) No results are saved)"
    ),
)
args.add_argument(
    "--llm-api",
    type=str,
    default="openai",
    help=(
        f"The name of the llm api to use. Can select from {SUPPORTED_APIS}"
        " (default: %(default)s)"
    ),
)
args.add_argument(
    "--metadata",
    type=str,
    default="",
    help=(
        "A comma separated list of metadata to include in the results, e.g. "
        "name=foo,bar=1. These will be added to the metadata field of the results. "
    ),
)

if __name__ == "__main__":
    # Forward the current environment (API keys etc.) to the Ray workers.
    env_vars = dict(os.environ)
    ray.init(runtime_env={"env_vars": env_vars})
    # From here on ``args`` is the parsed namespace, not the parser.
    args = args.parse_args()

    # Parse user metadata.
    user_metadata = {}
    if args.metadata:
        for item in args.metadata.split(","):
            # Split on the first "=" only so values may themselves contain
            # "=" (e.g. base64 strings or URLs with query parameters).
            key, value = item.split("=", 1)
            user_metadata[key] = value

    run_token_benchmark(
        llm_api=args.llm_api,
        model=args.model,
        test_timeout_s=args.timeout,
        max_num_completed_requests=args.max_num_completed_requests,
        mean_input_tokens=args.mean_input_tokens,
        stddev_input_tokens=args.stddev_input_tokens,
        mean_output_tokens=args.mean_output_tokens,
        stddev_output_tokens=args.stddev_output_tokens,
        num_concurrent_requests=args.num_concurrent_requests,
        additional_sampling_params=args.additional_sampling_params,
        results_dir=args.results_dir,
        user_metadata=user_metadata,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
on_startup.sh
CHANGED
@@ -14,6 +14,12 @@ git config --global credential.helper store
|
|
14 |
## Remove the temporary clone directory
|
15 |
#rm -rf /tmp/tgi-benchmark-notebooks
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
# Add dark theme
|
18 |
mkdir -p ~/.jupyter/lab/user-settings/@jupyterlab/apputils-extension/ && \
|
19 |
echo '{ "theme":"JupyterLab Dark" }' > ~/.jupyter/lab/user-settings/@jupyterlab/apputils-extension/themes.jupyterlab-settings
|
|
|
14 |
## Remove the temporary clone directory
|
15 |
#rm -rf /tmp/tgi-benchmark-notebooks
|
16 |
|
17 |
+
# Install llmperf
|
18 |
+
cd ~/app
|
19 |
+
git clone https://github.com/ray-project/llmperf.git
|
20 |
+
cd llmperf
|
21 |
+
git checkout afd137a
|
22 |
+
|
23 |
# Add dark theme
|
24 |
mkdir -p ~/.jupyter/lab/user-settings/@jupyterlab/apputils-extension/ && \
|
25 |
echo '{ "theme":"JupyterLab Dark" }' > ~/.jupyter/lab/user-settings/@jupyterlab/apputils-extension/themes.jupyterlab-settings
|
requirements.txt
CHANGED
@@ -3,9 +3,10 @@ jupyterlab-vim==0.15.1
|
|
3 |
jupyterlab-vimrc==0.5.2
|
4 |
jupyter-server==2.3.0
|
5 |
tornado==6.2
|
6 |
-
ipywidgets
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
|
|
|
3 |
jupyterlab-vimrc==0.5.2
|
4 |
jupyter-server==2.3.0
|
5 |
tornado==6.2
|
6 |
+
ipywidgets==8.1.3
|
7 |
+
huggingface-hub==0.23.2
|
8 |
+
transformers==4.41.2
|
9 |
+
pandas==2.2.2
|
10 |
+
datasets==2.19.1
|
11 |
+
plotly==5.22.0
|
12 |
+
ray[default]==2.23.0
|