Commit 2195fa8
Parent(s): 93d4ebf

Downloading instead of hardcoding llmperf
- llmperf/.gitignore +0 -247
- llmperf/LICENSE.txt +0 -202
- llmperf/NOTICE.txt +0 -14
- llmperf/README.md +0 -415
- llmperf/analyze-token-benchmark-results.ipynb +0 -327
- llmperf/llm_correctness.py +0 -309
- llmperf/pre-commit.sh +0 -5
- llmperf/pyproject.toml +0 -23
- llmperf/requirements-dev.txt +0 -2
- llmperf/src/llmperf/__init__.py +0 -1
- llmperf/src/llmperf/common.py +0 -38
- llmperf/src/llmperf/common_metrics.py +0 -17
- llmperf/src/llmperf/models.py +0 -21
- llmperf/src/llmperf/ray_clients/__init__.py +0 -0
- llmperf/src/llmperf/ray_clients/litellm_client.py +0 -100
- llmperf/src/llmperf/ray_clients/openai_chat_completions_client.py +0 -120
- llmperf/src/llmperf/ray_clients/sagemaker_client.py +0 -158
- llmperf/src/llmperf/ray_clients/vertexai_client.py +0 -135
- llmperf/src/llmperf/ray_llm_client.py +0 -22
- llmperf/src/llmperf/requests_launcher.py +0 -48
- llmperf/src/llmperf/sonnet.txt +0 -84
- llmperf/src/llmperf/utils.py +0 -147
- llmperf/token_benchmark_ray.py +0 -469
- on_startup.sh +6 -0
- requirements.txt +7 -6
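The vendored llmperf tree is removed, and the added lines in on_startup.sh and requirements.txt presumably fetch the package at startup instead. Those two diffs are not expanded on this page, so the following is only a minimal sketch of what such a startup hook could look like, assuming llmperf is pulled from its upstream repository (the URL comes from the deleted README below); the clone location, lack of version pin, and editable install are assumptions, not the committed lines.

```bash
#!/bin/bash
# Hypothetical sketch only -- the actual on_startup.sh diff is not shown on this page.
# Assumes llmperf is downloaded from upstream at startup instead of being
# hardcoded as a copy inside this repository.
if [ ! -d "llmperf" ]; then
    git clone https://github.com/ray-project/llmperf.git
fi
pip install -e ./llmperf
```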
llmperf/.gitignore
DELETED
@@ -1,247 +0,0 @@

# The build output should clearly not be checked in
*test-output.xml
/bazel-*
/python/ray/core
/python/ray/pickle5_files/
/python/ray/thirdparty_files/
/python/ray/pyarrow_files/
/python/ray/jars/
/python/ray/cpp/
/python/build
/python/dist
/python/python-driver-*
/python/ray/serve/generated
/thirdparty/pkg/
/build/java
.jar
/dashboard/client/build

# Files generated by flatc should be ignored
/src/ray/gcs/format/*_generated.h
/src/ray/object_manager/format/*_generated.h
/src/ray/raylet/format/*_generated.h
/java/runtime/src/main/java/io/ray/runtime/generated/*
/java/serve/src/main/java/io/ray/serve/generated/*

# Files genrated by c++ worker should be ignored.
/cpp/example/thirdparty/
/cpp/example/bazel-*
/python/ray/cpp

# Redis temporary files
*dump.rdb

# Python byte code files
*.pyc
python/.eggs
*.egg-info

# Backup files
*.bak

# Emacs temporary files
*~
*#

# Compiled Object files
*.slo
*.lo
*.o
*.xo
*.obj

# Precompiled Headers
*.gch
*.pch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll
python/ray/_raylet.pyd

# Incremental linking files
*.ilk

# Library export files
*.exp

# Debug symbols
*.pdb

# Fortran module files
*.mod
!deploy/ray-operator/go.mod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app

# Visual Studio files
/packages
*.suo
*.user
*.VC.db
*.VC.opendb

# Protobuf-generated files
*_pb2.py
*.pb.h
*.pb.cc

# Ray cluster configuration
scripts/nodes.txt

# OS X folder attributes
.DS_Store

# Debug files
*.dSYM/
*.su

# Python setup files
*.egg-info

# Compressed files
*.gz

# Datasets from examples
**/MNIST_data/
**/cifar-10-batches-bin/

# Generated documentation files
/doc/_build
/doc/source/_static/thumbs
/doc/source/tune/generated_guides/
/doc/source/**/doc/

# User-specific stuff:
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/dictionaries
.llvm-local.bazelrc

# Sensitive or high-churn files:
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.xml
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml

# Gradle:
.idea/**/gradle.xml
.idea/**/libraries
.idea

# Website
/site/Gemfile.lock
/site/.sass-cache
/site/_site

# Pytest Cache
**/.pytest_cache
**/.cache
.benchmarks
python-driver-*

# Vscode
.vscode/

*.iml

# Java
java/**/target
java/**/lib
java/**/.settings
java/**/.classpath
java/**/.project
java/runtime/native_dependencies/
java/testng_custom.xml

dependency-reduced-pom.xml

# Cpp
cpp/example/thirdparty/

.clwb

# pom.xml files generated from pom_template.xml
java/**/pom.xml

# python virtual env
venv

# pyenv version file
.python-version

# Vim
.*.swp
*.swp
.*.swo
*.swo
tags
tags.lock
tags.temp
*.vim

# Emacs
.#*

# tools
tools/prometheus*

# ray project files
project-id
.mypy_cache/

# release test related
.anyscale.yaml
test_state.json

# workflow storage
workflow_data/

# vscode java extention generated
.factorypath

# Jupyter Notebooks
**/.ipynb_checkpoints/

### Added by Hedron's Bazel Compile Commands Extractor: https://github.com/hedronvision/bazel-compile-commands-extractor
# The external link: Differs on Windows vs macOS/Linux, so we can't check it in. The pattern needs to not have a trailing / because it's a symlink on macOS/Linux.
/external
# Compiled output -> don't check in
/compile_commands.json
# Directory where clangd puts its indexing work
/.cache/

# Auto-generated tag mapping
tag-mapping.json

.bazeliskrc

# ignore tmp files
*.tmp
out
temp*

# build output
build/
dist/

# results
output/
*.json
result_outputs/

__pycache__
**/__pycache__/
llmperf/LICENSE.txt
DELETED
@@ -1,202 +0,0 @@

Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
llmperf/NOTICE.txt
DELETED
@@ -1,14 +0,0 @@

[Project Name]
Copyright 2023-onwards Anyscale, Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
llmperf/README.md
DELETED
@@ -1,415 +0,0 @@

# LLMPerf

A Tool for evaulation the performance of LLM APIs.

# Installation
```bash
git clone https://github.com/ray-project/llmperf.git
cd llmperf
pip install -e .
```

# Basic Usage

We implement 2 tests for evaluating LLMs: a load test to check for performance and a correctness test to check for correctness.

## Load test

The load test spawns a number of concurrent requests to the LLM API and measures the inter-token latency and generation throughput per request and across concurrent requests. The prompt that is sent with each request is of the format:

```
Randomly stream lines from the following text. Don't generate eos tokens:
LINE 1,
LINE 2,
LINE 3,
...
```

Where the lines are randomly sampled from a collection of lines from Shakespeare sonnets. Tokens are counted using the `LlamaTokenizer` regardless of which LLM API is being tested. This is to ensure that the prompts are consistent across different LLM APIs.

To run the most basic load test you can the token_benchmark_ray script.

### Caveats and Disclaimers

- The endpoints provider backend might vary widely, so this is not a reflection on how the software runs on a particular hardware.
- The results may vary with time of day.
- The results may vary with the load.
- The results may not correlate with users’ workloads.

### OpenAI Compatible APIs
```bash
export OPENAI_API_KEY=secret_abcdefg
export OPENAI_API_BASE="https://api.endpoints.anyscale.com/v1"

python token_benchmark_ray.py \
--model "meta-llama/Llama-2-7b-chat-hf" \
--mean-input-tokens 550 \
--stddev-input-tokens 150 \
--mean-output-tokens 150 \
--stddev-output-tokens 10 \
--max-num-completed-requests 2 \
--timeout 600 \
--num-concurrent-requests 1 \
--results-dir "result_outputs" \
--llm-api openai \
--additional-sampling-params '{}'

```

### Anthropic
```bash
export ANTHROPIC_API_KEY=secret_abcdefg

python token_benchmark_ray.py \
--model "claude-2" \
--mean-input-tokens 550 \
--stddev-input-tokens 150 \
--mean-output-tokens 150 \
--stddev-output-tokens 10 \
--max-num-completed-requests 2 \
--timeout 600 \
--num-concurrent-requests 1 \
--results-dir "result_outputs" \
--llm-api anthropic \
--additional-sampling-params '{}'

```

### TogetherAI

```bash
export TOGETHERAI_API_KEY="YOUR_TOGETHER_KEY"

python token_benchmark_ray.py \
--model "together_ai/togethercomputer/CodeLlama-7b-Instruct" \
--mean-input-tokens 550 \
--stddev-input-tokens 150 \
--mean-output-tokens 150 \
--stddev-output-tokens 10 \
--max-num-completed-requests 2 \
--timeout 600 \
--num-concurrent-requests 1 \
--results-dir "result_outputs" \
--llm-api "litellm" \
--additional-sampling-params '{}'

```

### Hugging Face

```bash
export HUGGINGFACE_API_KEY="YOUR_HUGGINGFACE_API_KEY"
export HUGGINGFACE_API_BASE="YOUR_HUGGINGFACE_API_ENDPOINT"

python token_benchmark_ray.py \
--model "huggingface/meta-llama/Llama-2-7b-chat-hf" \
--mean-input-tokens 550 \
--stddev-input-tokens 150 \
--mean-output-tokens 150 \
--stddev-output-tokens 10 \
--max-num-completed-requests 2 \
--timeout 600 \
--num-concurrent-requests 1 \
--results-dir "result_outputs" \
--llm-api "litellm" \
--additional-sampling-params '{}'

```

### LiteLLM

LLMPerf can use LiteLLM to send prompts to LLM APIs. To see the environment variables to set for the provider and arguments that one should set for model and additional-sampling-params.

see the [LiteLLM Provider Documentation](https://docs.litellm.ai/docs/providers).

```bash
python token_benchmark_ray.py \
--model "meta-llama/Llama-2-7b-chat-hf" \
--mean-input-tokens 550 \
--stddev-input-tokens 150 \
--mean-output-tokens 150 \
--stddev-output-tokens 10 \
--max-num-completed-requests 2 \
--timeout 600 \
--num-concurrent-requests 1 \
--results-dir "result_outputs" \
--llm-api "litellm" \
--additional-sampling-params '{}'

```

### Vertex AI

Here, --model is used for logging, not for selecting the model. The model is specified in the Vertex AI Endpoint ID.

The GCLOUD_ACCESS_TOKEN needs to be somewhat regularly set, as the token generated by `gcloud auth print-access-token` expires after 15 minutes or so.

Vertex AI doesn't return the total number of tokens that are generated by their endpoint, so tokens are counted using the LLama tokenizer.

```bash

gcloud auth application-default login
gcloud config set project YOUR_PROJECT_ID

export GCLOUD_ACCESS_TOKEN=$(gcloud auth print-access-token)
export GCLOUD_PROJECT_ID=YOUR_PROJECT_ID
export GCLOUD_REGION=YOUR_REGION
export VERTEXAI_ENDPOINT_ID=YOUR_ENDPOINT_ID

python token_benchmark_ray.py \
--model "meta-llama/Llama-2-7b-chat-hf" \
--mean-input-tokens 550 \
--stddev-input-tokens 150 \
--mean-output-tokens 150 \
--stddev-output-tokens 10 \
--max-num-completed-requests 2 \
--timeout 600 \
--num-concurrent-requests 1 \
--results-dir "result_outputs" \
--llm-api "vertexai" \
--additional-sampling-params '{}'

```

### SageMaker

SageMaker doesn't return the total number of tokens that are generated by their endpoint, so tokens are counted using the LLama tokenizer.

```bash

export AWS_ACCESS_KEY_ID="YOUR_ACCESS_KEY_ID"
export AWS_SECRET_ACCESS_KEY="YOUR_SECRET_ACCESS_KEY"s
export AWS_SESSION_TOKEN="YOUR_SESSION_TOKEN"
export AWS_REGION_NAME="YOUR_ENDPOINTS_REGION_NAME"

python llm_correctness.py \
--model "llama-2-7b" \
--llm-api "sagemaker" \
--max-num-completed-requests 2 \
--timeout 600 \
--num-concurrent-requests 1 \
--results-dir "result_outputs" \

```

see `python token_benchmark_ray.py --help` for more details on the arguments.

## Correctness Test

The correctness test spawns a number of concurrent requests to the LLM API with the following format:

```
Convert the following sequence of words into a number: {random_number_in_word_format}. Output just your final answer.
```

where random_number_in_word_format could be for example "one hundred and twenty three". The test then checks that the response contains that number in digit format which in this case would be 123.

The test does this for a number of randomly generated numbers and reports the number of responses that contain a mismatch.

To run the most basic correctness test you can run the the llm_correctness.py script.

### OpenAI Compatible APIs

```bash
export OPENAI_API_KEY=secret_abcdefg
export OPENAI_API_BASE=https://console.endpoints.anyscale.com/m/v1

python llm_correctness.py \
--model "meta-llama/Llama-2-7b-chat-hf" \
--max-num-completed-requests 150 \
--timeout 600 \
--num-concurrent-requests 10 \
--results-dir "result_outputs"
```

### Anthropic

```bash
export ANTHROPIC_API_KEY=secret_abcdefg

python llm_correctness.py \
--model "claude-2" \
--llm-api "anthropic" \
--max-num-completed-requests 5 \
--timeout 600 \
--num-concurrent-requests 1 \
--results-dir "result_outputs"
```

### TogetherAI

```bash
export TOGETHERAI_API_KEY="YOUR_TOGETHER_KEY"

python llm_correctness.py \
--model "together_ai/togethercomputer/CodeLlama-7b-Instruct" \
--llm-api "litellm" \
--max-num-completed-requests 2 \
--timeout 600 \
--num-concurrent-requests 1 \
--results-dir "result_outputs" \

```

### Hugging Face

```bash
export HUGGINGFACE_API_KEY="YOUR_HUGGINGFACE_API_KEY"
export HUGGINGFACE_API_BASE="YOUR_HUGGINGFACE_API_ENDPOINT"

python llm_correctness.py \
--model "huggingface/meta-llama/Llama-2-7b-chat-hf" \
--llm-api "litellm" \
--max-num-completed-requests 2 \
--timeout 600 \
--num-concurrent-requests 1 \
--results-dir "result_outputs" \

```

### LiteLLM

LLMPerf can use LiteLLM to send prompts to LLM APIs. To see the environment variables to set for the provider and arguments that one should set for model and additional-sampling-params.

see the [LiteLLM Provider Documentation](https://docs.litellm.ai/docs/providers).

```bash
python llm_correctness.py \
--model "meta-llama/Llama-2-7b-chat-hf" \
--llm-api "litellm" \
--max-num-completed-requests 2 \
--timeout 600 \
--num-concurrent-requests 1 \
--results-dir "result_outputs" \

```

see `python llm_correctness.py --help` for more details on the arguments.


### Vertex AI

Here, --model is used for logging, not for selecting the model. The model is specified in the Vertex AI Endpoint ID.

The GCLOUD_ACCESS_TOKEN needs to be somewhat regularly set, as the token generated by `gcloud auth print-access-token` expires after 15 minutes or so.

Vertex AI doesn't return the total number of tokens that are generated by their endpoint, so tokens are counted using the LLama tokenizer.


```bash

gcloud auth application-default login
gcloud config set project YOUR_PROJECT_ID

export GCLOUD_ACCESS_TOKEN=$(gcloud auth print-access-token)
export GCLOUD_PROJECT_ID=YOUR_PROJECT_ID
export GCLOUD_REGION=YOUR_REGION
export VERTEXAI_ENDPOINT_ID=YOUR_ENDPOINT_ID

python llm_correctness.py \
--model "meta-llama/Llama-2-7b-chat-hf" \
--llm-api "vertexai" \
--max-num-completed-requests 2 \
--timeout 600 \
--num-concurrent-requests 1 \
--results-dir "result_outputs" \

```

### SageMaker

SageMaker doesn't return the total number of tokens that are generated by their endpoint, so tokens are counted using the LLama tokenizer.

```bash

export AWS_ACCESS_KEY_ID="YOUR_ACCESS_KEY_ID"
export AWS_SECRET_ACCESS_KEY="YOUR_SECRET_ACCESS_KEY"s
export AWS_SESSION_TOKEN="YOUR_SESSION_TOKEN"
export AWS_REGION_NAME="YOUR_ENDPOINTS_REGION_NAME"

python llm_correctness.py \
--model "llama-2-7b" \
--llm-api "sagemaker" \
--max-num-completed-requests 2 \
--timeout 600 \
--num-concurrent-requests 1 \
--results-dir "result_outputs" \

```

## Saving Results

The results of the load test and correctness test are saved in the results directory specified by the `--results-dir` argument. The results are saved in 2 files, one with the summary metrics of the test, and one with metrics from each individual request that is returned.

# Advanced Usage

The correctness tests were implemented with the following workflow in mind:

```python
import ray
from transformers import LlamaTokenizerFast

from llmperf.ray_clients.openai_chat_completions_client import (
    OpenAIChatCompletionsClient,
)
from llmperf.models import RequestConfig
from llmperf.requests_launcher import RequestsLauncher


# Copying the environment variables and passing them to ray.init() is necessary
# For making any clients work.
ray.init(runtime_env={"env_vars": {"OPENAI_API_BASE" : "https://api.endpoints.anyscale.com/v1",
                                   "OPENAI_API_KEY" : "YOUR_API_KEY"}})

base_prompt = "hello_world"
tokenizer = LlamaTokenizerFast.from_pretrained(
    "hf-internal-testing/llama-tokenizer"
)
base_prompt_len = len(tokenizer.encode(base_prompt))
prompt = (base_prompt, base_prompt_len)

# Create a client for spawning requests
clients = [OpenAIChatCompletionsClient.remote()]

req_launcher = RequestsLauncher(clients)

req_config = RequestConfig(
    model="meta-llama/Llama-2-7b-chat-hf",
    prompt=prompt
)

req_launcher.launch_requests(req_config)
result = req_launcher.get_next_ready(block=True)
print(result)

```

# Implementing New LLM Clients

To implement a new LLM client, you need to implement the base class `llmperf.ray_llm_client.LLMClient` and decorate it as a ray actor.

```python

from llmperf.ray_llm_client import LLMClient
import ray


@ray.remote
class CustomLLMClient(LLMClient):

    def llm_request(self, request_config: RequestConfig) -> Tuple[Metrics, str, RequestConfig]:
        """Make a single completion request to a LLM API

        Returns:
            Metrics about the performance charateristics of the request.
            The text generated by the request to the LLM API.
            The request_config used to make the request. This is mainly for logging purposes.

        """
        ...

```

# Legacy Codebase
The old LLMPerf code base can be found in the [llmperf-legacy](https://github.com/ray-project/llmval-legacy) repo.
llmperf/analyze-token-benchmark-results.ipynb
DELETED
@@ -1,327 +0,0 @@

# Token Benchmark Example Analysis
The following is an example of the analysis that can be done on individual responses that are saved when running `token_benchmark_ray.py` with the flag `--results-dir` which enables the saving of all responses.

In [1]:  import pandas as pd

In [6]:  # path to the individual responses json file
         df = pd.read_json('/home/ray/default/llmperf/result_outputs/550_150_individual_responses.json')

Out[6]:
   error_code error_msg                              inter_token_latency_s    ttft_s  end_to_end_latency_s  request_output_throughput_token_per_s  number_total_tokens  number_output_tokens  number_input_tokens
0         NaN            [0.5549881670012831, 0.0009654169989510001, 0....  0.554988              1.610734                              44.079272                  706                    71                  635
1         NaN            [0.6019128750049271, 0.007011749999946, 0.0144...  0.601913              1.725729                              44.039357                  730                    76                  654

In [12]: valid_df = df[(df["error_code"] != "")]

In [13]: valid_df

Out[13]: (the same two rows as above)

In [14]: (stdout)
         Mean number of input tokens: 644.5. Mean number of output tokens: 73.5

Out[14]: <Axes: title={'center': 'Number of Input Tokens vs. TTFT'}, xlabel='number_input_tokens', ylabel='ttft_s'>

[Figure: "Number of Input Tokens vs. TTFT" plot (number_input_tokens vs. ttft_s), embedded in the notebook as base64 PNG data; truncated here.]
Oj1aBBA02YMEGSNH78eHXp0kUhISFKS0vT5MmTlZSUpEGDBtn7HDVqlPr3769u3brp5ptv1po1a7Ry5Upt2LChIkoEAACVTIUHoP79++vkyZMaM2aMkpOT1b59e61Zs8b+YPThw4fl5PS/C1VnzpzR4MGDlZycrNq1ays0NFSbN292uOV11113acaMGZowYYKGDx+uFi1a6NNPP9WNN95Y7vUBAIDKp8I/B6gy4nOAAACoeqrM5wABAABUBAIQAACwHAIQAACwHAIQAACwHAIQAACwHAIQAACwHAIQAACwHAIQAACwHAIQAACwHAIQAACwHAIQAACwHAIQAACwHAIQAACwHAIQAACwHAIQAACwHAIQAACwHAIQAACwHAIQAACwHAIQAACwHAIQAACwHAIQAACwHAIQAACwHAIQAACwHAIQAACwHAIQAACwnGoVPQCrOXAyU0mnzyu4bnU19qle0cMBAMCSCEDlJO18toYv2qlNiSftbd2a+WragA7y9nSpwJEBAGA93AIrJ8MX7dS3+1Id2r7dl6onF/1YQSMCAMC6CEDl4MDJTG1KPKkcYxzac4zRpsSTOph6roJGBgCANRGAykHS6fNFzj90igAEAEB5IgCVg6A6nkXOD67Lw9AAAJQnAlA5aOJbQ92a+crZZnNod7bZ1K2ZL+8GAwCgnBGAysm0AR3UNcTHoa1riI+mDehQQSMCAMC6eBt8OfH2dNGHj3TWwdRzOnTqHJ8DBABABSIAlbPGPgQfAAAqGrfAAACA5RCAAACA5RCAAACA5RCAAACA5RCAAACA5RCAAACA5RCAAACA5RCAAACA5RCAAACA5RCAAACA5fBVGAUwxkiSMjIyKngkAACguPL+buf9HS8KAagAZ8+elSQFBgZW8EgAAEBJnT17Vt7e3kUuYzPFiUkWk5ubq99//101a9aUzWYr1b4zMjIUGBioI0eOyMvLq1T7rmyo9dplpXqp9dplpXqtUqsxRmfPnlX9+vXl5FT0Uz5cASqAk5OTGjZsWKbb8PLyuqZ/CC9FrdcuK9VLrdcuK9VrhVqvdOUnDw9BAwAAyyEAAQAAyyEAlTM3NzeNHTtWbm5uFT2UMket1y4r1Uut1y4r1WulWouLh6ABAIDlcAUIAABYDgEIAABYDgEIAABYDgEIAABYDgGoBI4dO6YHH3xQdevWlYeHh66//nr98MMP9vnjxo1Ty5YtVb16ddWuXVtRUVHaunWrQx/BwcGy2WwO08SJE4vc7h9//KGhQ4eqbt26qlGjhvr166eUlJQyqTHP3611w4YN+erMm77//vtCt9u9e/d8yz/22GNlWqt05Xov9dhjj8lms2nq1KkO7adPn9YDDzwgLy8v1apVS4888ogyMzOL3G5lPLaXKqjWQ4cO6ZFHHlHjxo3l4eGhpk2bauzYscrOzi5yuxVxbEvjuFaVc1b6+/VWpfP2SrU+/PDD+cbUu3dvhz6ulXP2SrVWpXO2PPFJ0MV05swZde3aVTfffLO++OIL+fr6KjExUbVr17Yv07x5c7377rtq0qSJLly4oLfffls9e/bUvn375Ovra19u/PjxGjx4sP11zZo1i9z2U089pVWrVmnp0qXy9vbWsGHDdPfdd+vbb78t/UJVOrVGRETo+PHjDv2OHj1a8fHx6tixY5HbHzx4sMaPH29/7enpWboFXqY49eb5/PPP9d1336l+/fr55j3wwAM6fvy4vvzyS/3555+KiYnRo48+qo8//rjQbVfGY5unsFp/++035ebmaubMmQoJCdHu3bs1ePBgnTt3Tm+++WaR2y/PY1tax1Wq/OesVDr1VpXztri19u7dW3PnzrW/vvwt4NfSOVtUrVXlnC13BsXy3HPPmRtvvLFE66SnpxtJZt26dfa2oKAg8/bbbxe7j7S0NOPi4mKWLl1qb9uzZ4+RZLZs2VKi8RRXadV6qezsbOPr62vGjx9fZD+RkZFmxIgRJdr231Xceo8ePWoaNGhgdu/ene84/vrrr0aS+f777+1tX3zxhbHZbObYsWMF9leZj21RtRbkjTfeMI0bNy5ymfI+tqVVa1U4Z40pm2NbWc/b4tQ6cOBA07dv30LnX0vn7JVqLUhlPGfLG7fAimnFihXq2LGj7rnnHtWrV08dOnTQ7NmzC10+Oztbs2bNkre3t9q1a+cwb+LEiapbt646dOigyZMn6+LFi4X2s337dv3555+Kioqyt7Vs2VKNGjXSli1b/n5hBSjNWi/t89SpU4qJibni9j/66CP5+PjouuuuU1xcnM6fP3/VtRRHcerNzc3VQw89pFGjRqlNmzb5+tiyZYtq1arl8L/kqKgoOTk55bsNmqeyHtsr1VqQ9PR01alT54rLleexLc1aK/s5K5XNsa2s521xf0dt2LBB9erVU4sWLfT444/r1KlT9nnX0jkrFV1rQSrjOVvuKjqBVRVubm7Gzc3NxMXFmR07dpiZM2cad3d3M2/ePIflVq5caapXr25sNpupX7++2bZtm8P8KVOmmPXr15uffvrJvPfee6ZWrVrmqaeeKnS7H330kXF1dc3X3qlTJ/Pss8+WTnGXKa1aL9WnTx/Tp0+fK2575syZZs2aNebnn382CxcuNA0aNDB33XXX366pKMWp9/XXXzc9evQwubm5xpj8VwVee+0107x583x9+/r6mv/85z8FbreyHtsr1Xq5xMRE4+XlZWbNmlXktsv72JZWrVXhnDWmbI5tZT1vi1ProkWLzPLly83PP/9sPv/8c9OqVSvTqVMnc/HiRWPMtXXOXqnWy1XWc7a8EYCKycXFxYSHhzu0Pfnkk6ZLly4ObZmZmSYxMdFs2bLF/Pvf/zbBwcEmJSWl0H4/+OADU61aNfPHH38UOL8iTrjSrvXIkSPGycnJLFu2rMRjiY+PN5LMvn37SrxucV2p3h9++MH4+fk5XBavqgGoNGq91NGjR03Tpk3NI488UuKxlPWxLe1a81TGc9aY0q+3Mp+3xf0ddan9+/c73Ka/Vs7Zglxe66Uq8zlb3rgFVkwBAQFq3bq1Q1urVq10+PBhh7bq1asrJCREXbp00QcffKBq1arpgw8+KLTfsLAwXbx4UYcOHSpwvr+/v7Kzs5WWlubQnpKSIn9//6uq5UpKu9a5c+eqbt26uuOOO0o8lrCwMEnSvn37SrxucV2p3q+//lonTpxQo0aNVK1aNVWrVk1JSUl6+umnFRwcLOmv43TixAmHPi5evKjTp08Xepwq47EtTq15fv/9d918882KiIjQrFmzSjyWsj62pVnr5eOubOesVPr1Vubztri/oy7VpEkT+fj42Md0rZyzBbm81jyV/ZwtbwSgYuratav27t3r0JaQkKCgoKAi18vNzVVWVlah83fu3CknJyfVq1evwPmhoaFycXFRfHy8vW3v3r06fPiwwsPDS1BB8ZVmrcYYzZ07V9HR0XJxcSnxWHbu3Cnpr18CZeVK9T700EP6+eeftXPnTvtUv359jRo1SmvXrpUkhYeHKy0tTdu3b7f38dVXXyk3N9f+S+NylfHYFqdW6a+35Xbv3
l2hoaGaO3eunJxK/qukrI9tadVa0Lgr2zkrlW69lf28vZrfUUePHtWpU6fsY7pWztmCXF6rVDXO2XJX0Zegqopt27aZatWqmddee80kJiaajz76yHh6epqFCxcaY/66HRQXF2e2bNliDh06ZH744QcTExNj3NzczO7du40xxmzevNm8/fbbZufOnWb//v1m4cKFxtfX10RHR9u3c/ToUdOiRQuzdetWe9tjjz1mGjVqZL766ivzww8/mPDw8HyXRCtbrXnWrVtnJJk9e/bk287lte7bt8+MHz/e/PDDD+bgwYNm+fLlpkmTJqZbt25lVmtx6i1IQbcOevfubTp06GC2bt1qvvnmG9OsWTMzYMAA+/yqcGwLcnmtR48eNSEhIebWW281R48eNcePH7dPly5T0ce2NGqtKuesMaX3c2xM5T9vr1Tr2bNnzTPPPGO2bNliDh48aNatW2duuOEG06xZM4dbl9fCOVucWqvKOVveCEAlsHLlSnPdddcZNzc307JlS4cHyC5cuGDuuusuU79+fePq6moCAgLMHXfc4fBg8Pbt201YWJjx9vY27u7uplWrVub11193OCEPHjxoJJn169c79P3EE0+Y2rVrG09PT3PXXXc5/OBWxlrzDBgwwERERBS4jctrPXz4sOnWrZupU6eOcXNzMyEhIWbUqFEmPT29TGq8VFH1FqSgPxynTp0yAwYMMDVq1DBeXl4mJibGnD171j6/Khzbglxe69y5c42kAqc8leXY/t1aq9I5a0zp/BwbUzXO26JqPX/+vOnZs6fx9fU1Li4uJigoyAwePNgkJyc79HEtnLPFqbUqnbPlyWaMMeV5xQkAAKCi8QwQAACwHAIQAACwHAIQAACwHAIQAACwHAIQAACwHAIQAACwHAIQAACwHAIQgL+te/fuGjlyZLlv9+GHH9add95Z7tstT8HBwZo6dWpFDwO45hCAAFRZ77zzjubNm1fu2503b55q1apVonUIMkDlUq2iBwAABcnJyZHNZivySxu9vb3LcUQAriVcAQKuId27d9fw4cP17LPPqk6dOvL399e4ceMkSYcOHZLNZrN/o7MkpaWlyWazacOGDZKkDRs2yGazae3aterQoYM8PDx0yy236MSJE/riiy/UqlUreXl56f7779f58+cdtn3x4kUNGzZM3t7e8vHx0ejRo3XpN+1kZWXpmWeeUYMGDVS9enWFhYXZtyv976rKihUr1Lp1a7m5uenw4cNF1nv5LbCi6s9js9n03nvvqU+fPvLw8FCTJk20bNky+/y8fZCWlmZv27lzp2w2mw4dOqQNGzYoJiZG6enpstlsstls+bZxue7duyspKUlPPfWUfZ08n376qdq0aSM3NzcFBwdrypQpRfb1/vvvq1atWvZvJN+9e7f69OmjGjVqyM/PTw899JBSU1OLvU+MMRo3bpwaNWokNzc31a9fX8OHDy9yDMA1oWK/igxAaYqMjDReXl5m3LhxJiEhwcyfP9/YbDbz3//+1/5lhz/++KN9+TNnzjh8AeL69euNJNOlSxfzzTffmB07dpiQkBATGRlpevbsaXbs2GE2bdpk6tatayZOnOiw3Ro1apgRI0aY3377zSxcuNB4eno6fGnjoEGDTEREhNm0aZPZt2+fmTx5snFzczMJCQnGmL++sNHFxcVERESYb7/91vz222/m3LlzRdY7cOBA07dv32LVn0eSqVu3rpk9e7bZu3eveemll4yzs7P59ddfHfbBmTNn7Ov8+OOPRpI5ePCgycrKMlOnTjVeXl72b9S+9As0C3Lq1CnTsGFDM378eIdv4f7hhx+Mk5OTGT9+vNm7d6+ZO3eu8fDwMHPnzrWve+kXlk6aNMnUrVvX/o3dZ86cMb6+viYuLs7s2bPH7Nixw/To0cPcfPPNxd4nS5cuNV5eXmb16tUmKSnJbN269YpfogpcCwhAwDUkMjLS3HjjjQ5tnTp1Ms8991yJAtC6devsy0yYMMFIMvv377e3DRkyxPTq1cthu61atTK5ubn2tueee860atXKGGNMUlKScXZ2NseOHXMY26233mri4uKMMf/7xuqdO3cWu96CAlBh9eeRZB577DGHZcLCwszjjz/usA8KC0B5Y/X29i72OI0p+JvX77//ftOjRw+HtlGjRpnWrVvnW+/ZZ581AQEBZvfu3fZ5r7zyiunZs6fD+keOHDGSzN69e40xV94nU6ZMMc2bNzfZ2dklqgeo6rgFBlxj2rZt6/A6ICBAJ06cuOo+/Pz85OnpqSZNmji0Xd5nly5dHG7thIeHKzExUTk5Odq1a5dycnLUvHlz1ahRwz5t3LhR+/fvt6/j6uqab/wlVZz6w8PD873es2fP39ru1dizZ4+6du3q0Na1a1f7fsszZcoUzZ49W998843atGljb//pp5+0fv16h33asmVLSXLYr0Xtk3vuuUcXLlxQkyZNNHjwYH3++ee6ePFiqdcKVDY8BA1cY1xcXBxe22w25ebm2h8mNpc8l/Pnn39esQ+bzVZon8WVmZkpZ2dnbd++Xc7Ozg7zatSoYf+3h4eHQ4i6Gn93rCXZT+Xlpptu0qpVq7RkyRI9//zz9vbMzEzdfvvtmjRpUr51AgIC7P8uap8EBgZq7969Wrdunb788ks98cQTmjx5sjZu3JhvPeBaQgACLMLX11eSdPz4cXXo0EGSHB6I/ru2bt3q8Pq7775Ts2bN5OzsrA4dOignJ0cnTpzQTTfdVGrbvFrfffedoqOjHV7n7ZNL91Pt2rUl5d9Prq6uDldoiqOgdVq1aqVvv/3Woe3bb79V8+bNHYJi586dNWzYMPXu3VvVqlXTM888I0m64YYb9Omnnyo4OFjVql39r3MPDw/dfvvtuv322zV06FC1bNlSu3bt0g033HDVfQKVHbfAAIvw8PBQly5dNHHiRO3Zs0cbN27USy+9VGr9Hz58WLGxsdq7d68WLVqkadOmacSIEZKk5s2b64EHHlB0dLQ+++wzHTx4UNu2bdOECRO0atWqUhtDcS1dulRz5sxRQkKCxo4dq23btmnYsGGSpJCQEAUGBmrcuHFKTEzUqlWr8r0zKzg4WJmZmYqPj1dqamq+d8QVJDg4WJs2bdKxY8fs79J6+umnFR8fr1deeUUJCQmaP3++3n33XXvAuVRERIRWr16tl19+2f55QkOHDtXp06c1YMAAff/999q/f7/Wrl2rmJiYYge0efPm6YMPPtDu3bt14MABLVy4UB4eHgoKCirW+kBVRQACLGTOnDm6ePGiQkNDNXLkSL366qul1nd0dLQuXLigzp07a+jQoRoxYoQeffRR+/y5c+cqOjpaTz/9tFq0aKE777xT33//vRo1alRqYyiul19+WZ988onatm2rDz/8UIsWLVLr1q0l/XW7aNGiRfrtt9/Utm1bTZo0Kd9+ioiI0GOPPab+/fvL19dXb7zxxhW3OX78eB06dEhNmza1X2W64YYbtGTJEn3yySe67rrrNGbMGI0fP14PP/xwgX3ceOONWrVqlV566SVNmzZN9evX17fffqucnBz17NlT
119/vUaOHKlatWoV+flJl6pVq5Zmz56trl27qm3btlq3bp1WrlypunXrFmt9oKqymUtvdAPANc5ms+nzzz+/5r9CA0DRuAIEAAAshwAEoNK69O3dl09ff/11RQ/P7uuvvy5yrAAqH26BAai09u3bV+i8Bg0ayMPDoxxHU7gLFy7o2LFjhc4PCQkpx9EAKA4CEAAAsBxugQEAAMshAAEAAMshAAEAAMshAAEAAMshAAEAAMshAAEAAMshAAEAAMshAAEAAMv5//KcgIuL/M9kAAAAAElFTkSuQmCC",
     "text/plain": [
      "<Figure size 640x480 with 1 Axes>"
     ]
    },
    "metadata": {},
    "output_type": "display_data"
   }
  ],
  "source": [
   "final_df = pd.DataFrame()\n",
   "final_df[\"number_input_tokens\"] = valid_df[\"number_input_tokens\"]\n",
   "final_df[\"number_output_tokens\"] = valid_df[\"number_output_tokens\"]\n",
   "final_df[\"ttft_s\"] = valid_df[\"ttft_s\"]\n",
   "final_df[\"end_to_end_latency_s\"] = valid_df[\"end_to_end_latency_s\"]\n",
   "final_df[\"generation_throughput\"] = valid_df[\"request_output_throughput_token_per_s\"]\n",
   "\n",
   "mean_tokens_in = final_df[\"number_input_tokens\"].mean()\n",
   "mean_tokens_out = valid_df[\"number_output_tokens\"].mean()\n",
   "print(f\"Mean number of input tokens: {mean_tokens_in}. Mean number of output tokens: {mean_tokens_out}\")\n",
   "final_df.plot.scatter(x=\"number_input_tokens\", y=\"ttft_s\", title=\"Number of Input Tokens vs. TTFT\")"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 15,
  "id": "a14de79c",
  "metadata": {},
  "outputs": [
   {
    "data": {
     "text/plain": [
      "<Axes: title={'center': 'Token Latencies'}, ylabel='Frequency'>"
     ]
    },
    "execution_count": 15,
    "metadata": {},
    "output_type": "execute_result"
   },
   {
    "data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAGzCAYAAADT4Tb9AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/SrBM8AAAACXBIWXMAAA9hAAAPYQGoP6dpAAAvAklEQVR4nO3de1TUVb/H8c+AAl7wUioocsRILTO19EiEVhaJ6aHMfLyVIpkdE08m6VNmiaaJWZKdMk3zkqdj+pSXWqWWktbpaI8nL1k9XlIjvIGQFxQTEPb5w+U8TWDCODCwfb/WmrWaPXv/5rt31HzW77d/Mw5jjBEAAIAlfLxdAAAAgCcRbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuABTjcDg0cuRIb5dxVXA4HJo4caK3ywCsQrgBLOFwOEr12Lhxo7dLLZO77rpLbdq08cixNm3apIkTJ+rkyZMeOR6AyqmatwsA4Bn/9V//5fJ88eLFWrduXbH2G2+8sSLLqlQ2bdqkSZMmaciQIapXr563y5Ek/fbbb6pWjf8VA57Ef1GAJR555BGX5998843WrVtXrB2VS0BAgLdLAKzDZSngKpKbm6unn35aoaGh8vf3V6tWrfTqq6/KGHPZsVOmTJGPj4/eeOMNZ9uaNWvUpUsX1apVS4GBgerZs6d+/PFHl3FDhgxR7dq1dfjwYfXq1Uu1a9dWw4YNNWbMGBUWFnpkXjt37tSQIUN03XXXKSAgQMHBwXr00Uf166+/OvtMnDhRY8eOlSQ1b97ceZkuLS3N2ee9995Thw4dVKNGDV1zzTXq37+/Dh486PJeFy+T/eMf/1DXrl1Vs2ZNhYSEaPr06cXqOnfunCZOnKiWLVsqICBAjRs3Vu/evbV//35nn5L23Bw+fFiPPvqogoKC5O/vr5tuukkLFiwodvw33nhDN910k2rWrKn69eurY8eOWrJkiTtLCFiFMzfAVcIYo/vvv18bNmzQ0KFD1b59e3322WcaO3asDh8+rNdee+2SY59//nlNnTpVb7/9toYNGybpwmWwuLg4xcTE6OWXX9bZs2c1e/Zsde7cWdu3b1dYWJhzfGFhoWJiYhQREaFXX31V69ev14wZMxQeHq4nnnjiiue2bt06HThwQPHx8QoODtaPP/6ouXPn6scff9Q333wjh8Oh3r17a+/evXr//ff12muvqUGDBpKkhg0bSpJeeuklvfDCC+rbt68ee+wxZWVl6Y033tAdd9yh7du3u1zGOnHihLp3767evXurb9+++vDDD/XMM8/o5ptv1n333eec87/9278pNTVV/fv316hRo3T69GmtW7dOP/zwg8LDw0ucS2Zmpm677Tbnpu6GDRtqzZo1Gjp0qHJycvTUU09JkubNm6cnn3xSffr00ahRo3Tu3Dnt3LlTf//73zVw4MArXlOgSjMArJSQkGB+/5/4qlWrjCQzZcoUl359+vQxDofD7Nu3z9kmySQkJBhjjHn66aeNj4+PWbRokfP106dPm3r16plhw4a5HCsjI8PUrVvXpT0uLs5IMi+++KJL31tuucV06NDhsvO48847zU033fSnfc6ePVus7f333zeSzFdffeVse+WVV4wk8/PPP7v0TUtLM76+vuall15yaf/+++9NtWrVXNrvvPNOI8ksXrzY2ZaXl2eCg4PNQw895GxbsGCBkWRSUlKK1VZUVOT8Z0kmKSnJ+Xzo0KGmcePGJjs722VM//79Td26dZ1zfeCBBy67LsDVistSwFVi9erV8vX11ZNPPunS/vTTT8sYozVr1ri0G2M0cuRIvf7663rvvfcUFxfnfG3dunU6efKkBgwYoOzsbOfD19dXERER2rBhQ7H3Hz58uMvzLl266MCBAx6ZW40aNZz/fO7cOWVnZ+u2226TJG3btu2y41esWKGioiL17dvXZT7BwcFq0aJFsfnUrl3bZS+Tn5+fOnXq5DKf5cuXq0GDBvqP//iPYu/ncDhKrMMYo+XLlys2NlbGGJdaYmJidOrUKed86tWrp0OHDun//u//Ljs/4GrDZSngKvHLL7+oSZMmCgwMdGm/ePfUL7/84tK+ePFinTlzRrNnz9aAAQNcXvvpp58kSXfffXeJ71WnTh2X5wEBAc7LPxfVr19fJ06cKPtESnD8+HFNmjRJS5cu1bFjx1xeO3Xq1GXH//TTTzLGqEWLFiW+Xr16dZfnTZs2LRZQ6tevr507dzqf79+/X61atSrTnVBZWVk6efKk5s6dq7lz55bY5+L8nnnmGa1fv16dOnXS9ddfr27dumngwIGKiooq9fsBtiLcAChRVFSUduzYoTfffFN9+/bVNddc43ytqKhI0oV9N8HBwcXG/vED3dfXt1xr7du3rzZt2qSxY8eqffv2ql27toqKitS9e3dnrX+mqKhIDodDa9asKbHW2rVruzy/1HxMKTZmX64O6cKdb78/U/Z7bdu2lXQhlO7Zs0effPKJ1q5dq+XLl+utt97ShAkTNGnSpCuqA6jqCDfAVaJZs2Zav369Tp8+7XL2Zvfu3c7Xf+/666/X9OnTddddd6l79+5KTU11jru4GbZRo0aKjo6uoBmU7MSJE0pNTdWkSZM0YcIEZ/vFs0u/d6nLQeHh4TLGqHnz5mrZsqVH6goPD9ff//53FRQUFDvzcykNGzZUYGCgCgsLS7WutWrVUr9+/dSvXz/l5+erd+/eeumllzRu3DhuMcdVjT03wFWiR48eKiws1JtvvunS/tprr8nhcDjv8vm9tm3bavXq1dq1a5diY2P122+/SZJiYmJUp04dTZ06VQUFBcXGZWVllc8kSnDxLMofz5rMnDmzWN9atWpJUrFvKO7du7d8fX01adKkYscxxrjcUl5aDz30kLKzs4utd0m1XuTr66uHHnpIy5cv1w8//FDs9d+v6x9r8vPzU+vWrWWMKfHfCXA14cwNcJWIjY1V165dNX78eKWlpaldu3b6/PPP9dFHH+mpp5665K3Jt912mz766CP16NFDffr00apVq1SnTh3Nnj1bgwYN0q233qr+/furYcOGSk9P16effqqoqKgSP9TdlZWVpSlTphRrb968uR5++GHdcccdmj59ugoKChQSEqLPP/9cP//8c7H+HTp0kCSNHz9e/fv3V/Xq1RUbG6vw8HBNmTJF48aNU1pamnr16qXAwED9/PPPWrlypR5//HGNGTOmTDUPHjxYixcvVmJiorZs2aIuXbooNzdX69ev14gRI/TAAw+UOG7atGnasGGDIiIiNGzYMLVu3VrHjx/Xtm3btH79eh0/flyS1K1bNwUHBysqKkpBQUHatWuX3nzzTfXs2bPYvirgquOdm7QAlLc/3gpuzIVbuEePHm2aNGliqlevblq0aGFeeeUVl1uTjXG9Ffyijz76yFSrVs3069fPFBYWGmOM2bBhg4mJiTF169Y1AQEBJjw83AwZMsR8++23znFxcXGmVq1axepLSkoqVl9JLt56XdLjnnvuMcYYc+jQIfPggw+ae
vXqmbp165q//OUv5siRI8VuszbGmMmTJ5uQkBDj4+NT7Lbw5cuXm86dO5tatWqZWrVqmRtuuMEkJCSYPXv2uNRT0i3YcXFxplmzZi5tZ8+eNePHjzfNmzc31atXN8HBwaZPnz5m//79zj4l1ZiZmWkSEhJMaGioc9w999xj5s6d6+zz9ttvmzvuuMNce+21xt/f34SHh5uxY8eaU6dOXXZNAds5jLnCHXAAAACVCHtuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsctV9iV9RUZGOHDmiwMDAS34VOwAAqFyMMTp9+rSaNGkiH58/Pzdz1YWbI0eOKDQ01NtlAAAANxw8eFBNmzb90z5XXbi5+LXkBw8eVJ06dbxcDQAAKI2cnByFhoaW6udFrrpwc/FSVJ06dQg3AABUMaXZUsKGYgAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwilfDzVdffaXY2Fg1adJEDodDq1atuuyYjRs36tZbb5W/v7+uv/56LVq0qNzrBAAAVYdXw01ubq7atWunWbNmlar/zz//rJ49e6pr167asWOHnnrqKT322GP67LPPyrlSAABQVXj1hzPvu+8+3XfffaXuP2fOHDVv3lwzZsyQJN144436+uuv9dprrykmJqa8ygQAAFVIldpzs3nzZkVHR7u0xcTEaPPmzZcck5eXp5ycHJcHAACwl1fP3JRVRkaGgoKCXNqCgoKUk5Oj3377TTVq1Cg2Jjk5WZMmTaqoEhX27KcV9l6ekjatp7dLAABcAp8rZVelzty4Y9y4cTp16pTzcfDgQW+XBAAAylGVOnMTHByszMxMl7bMzEzVqVOnxLM2kuTv7y9/f/+KKA8AAFQCVerMTWRkpFJTU13a1q1bp8jISC9VBAAAKhuvhpszZ85ox44d2rFjh6QLt3rv2LFD6enpki5cUho8eLCz//Dhw3XgwAH99a9/1e7du/XWW2/pb3/7m0aPHu2N8gEAQCXk1XDz7bff6pZbbtEtt9wiSUpMTNQtt9yiCRMmSJKOHj3qDDqS1Lx5c3366adat26d2rVrpxkzZuidd97hNnAAAODk1T03d911l4wxl3y9pG8fvuuuu7R9+/ZyrAoAAFRlVWrPDQAAwOUQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFa+Hm1mzZiksLEwBAQGKiIjQli1b/rT/zJkz1apVK9WoUUOhoaEaPXq0zp07V0HVAgCAys6r4WbZsmVKTExUUlKStm3bpnbt2ikmJkbHjh0rsf+SJUv07LPPKikpSbt27dL8+fO1bNkyPffccxVcOQAAqKy8Gm5SUlI0bNgwxcfHq3Xr1pozZ45q1qypBQsWlNh/06ZNioqK0sCBAxUWFqZu3bppwIABlz3bAwAArh5eCzf5+fnaunWroqOj/1mMj4+io6O1efPmEsfcfvvt2rp1qzPMHDhwQKtXr1aPHj0u+T55eXnKyclxeQAAAHtV89YbZ2dnq7CwUEFBQS7tQUFB2r17d4ljBg4cqOzsbHXu3FnGGJ0/f17Dhw//08tSycnJmjRpkkdrBwAAlZfXNxSXxcaNGzV16lS99dZb2rZtm1asWKFPP/1UkydPvuSYcePG6dSpU87HwYMHK7BiAABQ0bx25qZBgwby9fVVZmamS3tmZqaCg4NLHPPCCy9o0KBBeuyxxyRJN998s3Jzc/X4449r/Pjx8vEpntX8/f3l7+/v+QkAAIBKyWtnbvz8/NShQwelpqY624qKipSamqrIyMgSx5w9e7ZYgPH19ZUkGWPKr1gAAFBleO3MjSQlJiYqLi5OHTt2VKdOnTRz5kzl5uYqPj5ekjR48GCFhIQoOTlZkhQbG6uUlBTdcsstioiI0L59+/TCCy8oNjbWGXIAAMDVzavhpl+/fsrKytKECROUkZGh9u3ba+3atc5Nxunp6S5nap5//nk5HA49//zzOnz4sBo2bKjY2Fi99NJL3poCAACoZBzmKruek5OTo7p16+rUqVOqU6eOx48f9uynHj9meUub1tPbJQAALoHPlQvK8vldpe6WAgAAuBzCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFbfCzYEDBzxdBwAAgEe4FW6uv/56de3aVe+9957OnTvn6ZoAAADc5la42bZtm9q2bavExEQFBwfr3//937Vlyxa3Cpg1a5bCwsIUEBCgiIiIyx7n5MmTSkhIUOPGjeXv76+WLVtq9erVbr03AACwj1vhpn379nr99dd15MgRLViwQEePHlXnzp3Vpk0bpaSkKCsrq1THWbZsmRITE5WUlKRt27apXbt2iomJ0bFjx0rsn5+fr3vvvVdpaWn68MMPtWfPHs2bN08hISHuTAMAAFjoijYUV6tWTb1799YHH3ygl19+Wfv27dOYMWMUGhqqwYMH6+jRo386PiUlRcOGDVN8fLxat26tOXPmqGbNmlqwYEGJ/RcsWKDjx49r1apVioqKUlhYmO688061a9fuSqYBAAAsckXh5ttvv9WIESPUuHFjpaSkaMyYMdq/f7/WrVunI0eO6IEHHrjk2Pz8fG3dulXR0dH/LMbHR9HR0dq8eXOJYz7++GNFRkYqISFBQUFBatOmjaZOnarCwsJLvk9eXp5ycnJcHgAAwF7V3BmUkpKihQsXas+ePerRo4cW
L16sHj16yMfnQlZq3ry5Fi1apLCwsEseIzs7W4WFhQoKCnJpDwoK0u7du0scc+DAAX3xxRd6+OGHtXr1au3bt08jRoxQQUGBkpKSShyTnJysSZMmuTNNAABQBbkVbmbPnq1HH31UQ4YMUePGjUvs06hRI82fP/+KivujoqIiNWrUSHPnzpWvr686dOigw4cP65VXXrlkuBk3bpwSExOdz3NychQaGurRugAAQOXhVrj56aefLtvHz89PcXFxl3y9QYMG8vX1VWZmpkt7ZmamgoODSxzTuHFjVa9eXb6+vs62G2+8URkZGcrPz5efn1+xMf7+/vL3979svQAAwA5u7blZuHChPvjgg2LtH3zwgd59991SHcPPz08dOnRQamqqs62oqEipqamKjIwscUxUVJT27dunoqIiZ9vevXvVuHHjEoMNAAC4+rgVbpKTk9WgQYNi7Y0aNdLUqVNLfZzExETNmzdP7777rnbt2qUnnnhCubm5io+PlyQNHjxY48aNc/Z/4okndPz4cY0aNUp79+7Vp59+qqlTpyohIcGdaQAAAAu5dVkqPT1dzZs3L9berFkzpaenl/o4/fr1U1ZWliZMmKCMjAy1b99ea9eudW4yTk9Pd25SlqTQ0FB99tlnGj16tNq2bauQkBCNGjVKzzzzjDvTAAAAFnIr3DRq1Eg7d+4sdjfUd999p2uvvbZMxxo5cqRGjhxZ4msbN24s1hYZGalvvvmmTO8BAACuHm5dlhowYICefPJJbdiwQYWFhSosLNQXX3yhUaNGqX///p6uEQAAoNTcOnMzefJkpaWl6Z577lG1ahcOUVRUpMGDB5dpzw0AAICnuRVu/Pz8tGzZMk2ePFnfffedatSooZtvvlnNmjXzdH0AAABl4la4uahly5Zq2bKlp2oBAAC4Ym6Fm8LCQi1atEipqak6duyYy/fOSNIXX3zhkeIAAADKyq1wM2rUKC1atEg9e/ZUmzZt5HA4PF0XAACAW9wKN0uXLtXf/vY39ejRw9P1AAAAXBG3bgX38/PT9ddf7+laAAAArphb4ebpp5/W66+/LmOMp+sBAAC4Im5dlvr666+1YcMGrVmzRjfddJOqV6/u8vqKFSs8UhwAAEBZuRVu6tWrpwcffNDTtQAAAFwxt8LNwoULPV0HAACAR7i150aSzp8/r/Xr1+vtt9/W6dOnJUlHjhzRmTNnPFYcAABAWbl15uaXX35R9+7dlZ6erry8PN17770KDAzUyy+/rLy8PM2ZM8fTdQIAAJSKW2duRo0apY4dO+rEiROqUaOGs/3BBx9Uamqqx4oDAAAoK7fO3PzP//yPNm3aJD8/P5f2sLAwHT582COFAQAAuMOtMzdFRUUqLCws1n7o0CEFBgZecVEAAADucivcdOvWTTNnznQ+dzgcOnPmjJKSkvhJBgAA4FVuXZaaMWOGYmJi1Lp1a507d04DBw7UTz/9pAYNGuj999/3dI0AAACl5la4adq0qb777jstXbpUO3fu1JkzZzR06FA9/PDDLhuMAQAAKppb4UaSqlWrpkceecSTtQAAAFwxt8LN4sWL//T1wYMHu1UMAADAlXIr3IwaNcrleUFBgc6ePSs/Pz/VrFmTcAMAALzGrbulTpw44fI4c+aM9uzZo86dO7OhGAAAeJXbvy31Ry1atNC0adOKndUBAACoSB4LN9KFTcZHjhzx5CEBAADKxK09Nx9//LHLc2OMjh49qjfffFNRUVEeKQwAAMAdboWbXr16uTx3OBxq2LCh7r77bs2YMcMTdQEAALjFrXBTVFTk6ToAAAA8wqN7bgAAALzNrTM3iYmJpe6bkpLizlsAAAC4xa1ws337dm3fvl0FBQVq1aqVJGnv3r3y9fXVrbfe6uzncDg8UyUAAEApuRVuYmNjFRgYqHfffVf169eXdOGL/eLj49WlSxc9/fTTHi0SAACgtNzaczNjxgwlJyc7g40k1a9fX1OmTOFuKQAA4FVuhZucnBxlZWUVa8/KytLp06evuCgAAAB3uRVuHnzwQcXHx2vFihU6dOiQDh06pOXLl2vo0KHq3bu3p2sEAAAoNbf23MyZM0djxozRwIEDVVBQcOFA1app6NCheuWVVzxaIAAAQFm4FW5q1qypt956S6+88or2798vSQoPD1etWrU8WhwAAEBZXdGX+B09elRHjx5VixYtVKtWLRljPFUXAACAW9wKN7/++qvuuecetWzZUj169NDRo0clSUOHDuU2cAAA4FVuhZvRo0erevXqSk9PV82aNZ3t/fr109q1az1WHAAAQFm5tefm888/12effaamTZu6tLdo0UK//PKLRwoDAABwh1tnbnJzc13O2Fx0/Phx+fv7X3FRAAAA7nIr3HTp0kWLFy92Pnc4HCoqKtL06dPVtWtXjxUHAABQVm5dlpo+fbruueceffvtt8rPz9df//pX/fjjjzp+/Lj+93//19M1AgAAlJpbZ27atGmjvXv3qnPnznrggQeUm5ur3r17a/v27QoPD/d0jQAAAKVW5jM3BQUF6t69u+bMmaPx48eXR00AAABuK/OZm+rVq2vnzp3lUQsAAMAVc+uy1COPPKL58+d7uhYAAIAr5taG4vPnz2vBggVav369OnToUOw3pVJSUjxSHAAAQFmVKdwcOHBAYWFh+uGHH3TrrbdKkvbu3evSx+FweK46AACAMipTuGnRooWOHj2qDRs2SLrwcwv/+Z//qaCgoHIpDgAAoKzKtOfmj7/6vWbNGuXm5nq0IAAAgCvh1obii/4YdgAAALytTOHG4XAU21PDHhsAAFCZlGnPjTFGQ4YMcf445rlz5zR8+PBid0utWLHCcxUCAACUQZnCTVxcnMvzRx55xKPFAAAAXKkyhZuFCxeWVx0AAAAecUUbigEAACobwg0AALBKpQg3s2bNUlhYmAICAhQREaEtW7aUatzSpUvlcDjUq1ev8i0QAABUGV4PN8uWLVNiYqKSkpK0bds2tWvXTjExMTp27NifjktLS9OYMWPUpUuXCqoUAABUBV4PNykpKRo2bJji4+PVunVrzZkzRzVr1tSCBQsuOaawsFAPP/ywJk2apOuuu+5Pj5+Xl6ecnByXBwAAsJdXw01+fr62bt2q6OhoZ5uPj4+io6O1efPmS4578cUX1ahRIw0dOvSy75GcnKy6des6H6GhoR6pHQAAVE5eDTfZ2dkqLCws9sObQUFBysjIKHHM119/rfnz52vevHmleo9x48bp1KlTzsfBgwevuG4AAFB5lel7brzt9OnTGjRokObNm6cGDRqUaoy/v7/zG5UBAID9vBpuGjRoIF9fX2VmZrq0Z2ZmKjg4uFj//fv3Ky0tTbGxsc62oqIiSVK1atW0Z88ehYeHl2/RAACgUvPqZSk/Pz916NBBqampzraioiKlpqYqMjKyWP8bbrhB33//vXbs2OF83H///eratat27NjBfhoAAOD9y1KJiYmKi4tTx44d1alTJ82cOVO5ubmKj4+XJA0ePFghISFKTk5WQECA2rRp4zK+Xr16klSsHQAAXJ28Hm769eunrKw
sTZgwQRkZGWrfvr3Wrl3r3GScnp4uHx+v37EOAACqCK+HG0kaOXKkRo4cWeJrGzdu/NOxixYt8nxBAACgyuKUCAAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsUinCzaxZsxQWFqaAgABFRERoy5Ytl+w7b948denSRfXr11f9+vUVHR39p/0BAMDVxevhZtmyZUpMTFRSUpK2bdumdu3aKSYmRseOHSux/8aNGzVgwABt2LBBmzdvVmhoqLp166bDhw9XcOUAAKAy8nq4SUlJ0bBhwxQfH6/WrVtrzpw5qlmzphYsWFBi///+7//WiBEj1L59e91www165513VFRUpNTU1AquHAAAVEZeDTf5+fnaunWroqOjnW0+Pj6Kjo7W5s2bS3WMs2fPqqCgQNdcc02Jr+fl5SknJ8flAQAA7OXVcJOdna3CwkIFBQW5tAcFBSkjI6NUx3jmmWfUpEkTl4D0e8nJyapbt67zERoaesV1AwCAysvrl6WuxLRp07R06VKtXLlSAQEBJfYZN26cTp065XwcPHiwgqsEAAAVqZo337xBgwby9fVVZmamS3tmZqaCg4P/dOyrr76qadOmaf369Wrbtu0l+/n7+8vf398j9QIAgMrPq2du/Pz81KFDB5fNwBc3B0dGRl5y3PTp0zV58mStXbtWHTt2rIhSAQBAFeHVMzeSlJiYqLi4OHXs2FGdOnXSzJkzlZubq/j4eEnS4MGDFRISouTkZEnSyy+/rAkTJmjJkiUKCwtz7s2pXbu2ateu7bV5AACAysHr4aZfv37KysrShAkTlJGRofbt22vt2rXOTcbp6eny8fnnCabZs2crPz9fffr0cTlOUlKSJk6cWJGlAwCASsjr4UaSRo4cqZEjR5b42saNG12ep6WllX9BAACgyqrSd0sBAAD8EeEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxSKcLNrFmzFBYWpoCAAEVERGjLli1/2v+DDz7QDTfcoICAAN18881avXp1BVUKAAAqO6+Hm2XLlikxMVFJSUnatm2b2rVrp5iYGB07dqzE/ps2bdKAAQM0dOhQbd++Xb169VKvXr30ww8/VHDlAACgMvJ6uElJSdGwYcMUHx+v1q1ba86cOapZs6YWLFhQYv/XX39d3bt319ixY3XjjTdq8uTJuvXWW/Xmm29WcOUAAKAyqubNN8/Pz9fWrVs1btw4Z5uPj4+io6O1efPmEsds3rxZiYmJLm0xMTFatWpVif3z8vKUl5fnfH7q1ClJUk5OzhVWX7KivLPlctzyVF5rAQC4cnyuuB7TGHPZvl4NN9nZ2SosLFRQUJBLe1BQkHbv3l3imIyMjBL7Z2RklNg/OTlZkyZNKtYeGhrqZtX2qTvT2xUAAGxSnp8rp0+fVt26df+0j1fDTUUYN26cy5meoqIiHT9+XNdee60cDocXKyt/OTk5Cg0N1cGDB1WnTh1vl1OlsHbuYd3cw7q5j7VzT1VcN2OMTp8+rSZNmly2r1fDTYMGDeTr66vMzEyX9szMTAUHB5c4Jjg4uEz9/f395e/v79JWr14994uugurUqVNl/ngrG9bOPaybe1g397F27qlq63a5MzYXeXVDsZ+fnzp06KDU1FRnW1FRkVJTUxUZGVnimMjISJf+krRu3bpL9gcAAFcXr1+WSkxMVFxcnDp27KhOnTpp5syZys3NVXx8vCRp8ODBCgkJUXJysiRp1KhRuvPOOzVjxgz17NlTS5cu1bfffqu5c+d6cxoAAKCS8Hq46devn7KysjRhwgRlZGSoffv2Wrt2rXPTcHp6unx8/nmC6fbbb9eSJUv0/PPP67nnnlOLFi20atUqtWnTxltTqLT8/f2VlJRU7LIcLo+1cw/r5h7WzX2snXtsXzeHKc09VQAAAFWE17/EDwAAwJMINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwU0V89dVXio2NVZMmTeRwOC75Q6G/l5eXp/Hjx6tZs2by9/dXWFiYy6+tz5s3T126dFH9+vVVv359RUdHa8uWLeU4C+8oj7X7vaVLl8rhcKhXr16eLdzLymvdTp48qYSEBDVu3Fj+/v5q2bKlVq9eXU6zqHjltW4zZ85Uq1atVKNGDYWGhmr06NE6d+5cOc3CO8q6dkOGDJHD4Sj2uOmmm1z6zZo1S2FhYQoICFBERIR1/58rj3VLTk7Wv/7rvyowMFCNGjVSr169tGfPnnKeiecQbqqI3NxctWvXTrNmzSr1mL59+yo1NVXz58/Xnj179P7776tVq1bO1zdu3KgBAwZow4YN2rx5s0JDQ9WtWzcdPny4PKbgNeWxdhelpaVpzJgx6tKliydLrhTKY93y8/N17733Ki0tTR9++KH27NmjefPmKSQkpDym4BXlsW5LlizRs88+q6SkJO3atUvz58/XsmXL9Nxzz5XHFLymrGv3+uuv6+jRo87HwYMHdc011+gvf/mLs8+yZcuUmJiopKQkbdu2Te3atVNMTIyOHTtWXtOocOWxbl9++aUSEhL0zTffaN26dSooKFC3bt2Um5tbXtPwLIMqR5JZuXLln/ZZs2aNqVu3rvn1119Lfdzz58+bwMBA8+67715hhZWXJ9fu/Pnz5vbbbzfvvPOOiYuLMw888IDnCq1kPLVus2fPNtddd53Jz8/3cIWVk6fWLSEhwdx9990ubYmJiSYqKsoTZVZKpVm7P1
q5cqVxOBwmLS3N2dapUyeTkJDgfF5YWGiaNGlikpOTPVVqpeKpdfujY8eOGUnmyy+/vMIKKwZnbiz18ccfq2PHjpo+fbpCQkLUsmVLjRkzRr/99tslx5w9e1YFBQW65pprKrDSyqe0a/fiiy+qUaNGGjp0qJcqrVxKs24ff/yxIiMjlZCQoKCgILVp00ZTp05VYWGhFyv3rtKs2+23366tW7c6L6ccOHBAq1evVo8ePbxVdqU0f/58RUdHq1mzZpIunCncunWroqOjnX18fHwUHR2tzZs3e6vMSueP61aSU6dOSVKV+Xzw+s8voHwcOHBAX3/9tQICArRy5UplZ2drxIgR+vXXX7Vw4cISxzzzzDNq0qSJy/8IrkalWbuvv/5a8+fP144dO7xbbCVSmnU7cOCAvvjiCz388MNavXq19u3bpxEjRqigoEBJSUlenoF3lGbdBg4cqOzsbHXu3FnGGJ0/f17Dhw+37rLUlThy5IjWrFmjJUuWONuys7NVWFjo/Dmfi4KCgrR79+6KLrFSKmnd/qioqEhPPfWUoqKiqs5PHXn71BHKTqU47XjvvfeagIAAc/LkSWfb8uXLjcPhMGfPni3WPzk52dSvX9989913ni63UvHE2uXk5JiwsDCzevVq5+tclird31yLFi1MaGioOX/+vLPPjBkzTHBwcLnU7W2eWrcNGzaYoKAgM2/ePLNz506zYsUKExoaal588cXyLN+rSrN2vzd16lRz7bXXmry8PGfb4cOHjSSzadMml75jx441nTp18lSplYon1u2Phg8fbpo1a2YOHjzogQorBmduLNW4cWOFhISobt26zrYbb7xRxhgdOnRILVq0cLa/+uqrmjZtmtavX6+2bdt6o9xK5XJrl5ubq7S0NMXGxjpfLyoqkiRVq1ZNe/bsUXh4eIXX7W2l+Ztr3LixqlevLl9fX5c+GRkZys/Pl5+fnzdK96rSrNsLL7ygQYMG6bHHHpMk3XzzzcrNzdXjjz+u8ePHu/y48NXIGKMFCxZo0KBBLn9DDRo0kK+vrzIzM136Z2ZmKjg4uKLLrHQutW6/N3LkSH3yySf66quv1LRp0wqu0H1X938RFouKitKRI0d05swZZ9vevXvl4+Pj8gc6ffp0TZ48WWvXrlXHjh29UWqlc7m1u+GGG/T9999rx44dzsf999+vrl27aseOHQoNDfVi9d5Tmr+5qKgo7du3zxkGL/Zp3LjxVRlspNKt29mzZ4sFmIsB0fDbx/ryyy+1b9++Yvvf/Pz81KFDB6WmpjrbioqKlJqaqsjIyIous9K51LpJF/6uRo4cqZUrV+qLL75Q8+bNvVDhFfDeSSOUxenTp8327dvN9u3bjSSTkpJitm/fbn755RdjjDHPPvusGTRokEv/pk2bmj59+pgff/zRfPnll6ZFixbmsccec/aZNm2a8fPzMx9++KE5evSo83H69OkKn195Ko+1+yMbL0uVx7qlp6ebwMBAM3LkSLNnzx7zySefmEaNGpkpU6ZU+PzKS3msW1JSkgkMDDTvv/++OXDggPn8889NeHi46du3b4XPrzyVde0ueuSRR0xERESJx1y6dKnx9/c3ixYtMv/4xz/M448/burVq2cyMjLKdS4VqTzW7YknnjB169Y1GzdudPl8KGlbQ2VEuKkiNmzYYCQVe8TFxRljLny43nnnnS5jdu3aZaKjo02NGjVM06ZNTWJiossfZrNmzUo8ZlJSUsVNrAKUx9r9kY3hprzWbdOmTSYiIsL4+/ub6667zrz00ksue3CquvJYt4KCAjNx4kQTHh5uAgICTGhoqBkxYoQ5ceJExU2sArizdidPnjQ1atQwc+fOveRx33jjDfMv//Ivxs/Pz3Tq1Ml888035TiLilce61bS8SSZhQsXlu9kPMRhDOc0AQCAPdhzAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACr/D/vevnJpwE9FgAAAABJRU5ErkJggg==",
     "text/plain": [
      "<Figure size 640x480 with 1 Axes>"
     ]
    },
    "metadata": {},
    "output_type": "display_data"
   }
  ],
  "source": [
   "all_token_latencies = valid_df['end_to_end_latency_s'].apply(pd.Series).stack()\n",
   "all_token_latencies = all_token_latencies.reset_index(drop=True)\n",
   "all_token_latencies.plot.hist(title=\"Token Latencies\")\n"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": []
 }
],
"metadata": {
 "kernelspec": {
  "display_name": "Python 3 (ipykernel)",
  "language": "python",
  "name": "python3"
 },
 "language_info": {
  "codemirror_mode": {
   "name": "ipython",
   "version": 3
  },
  "file_extension": ".py",
  "mimetype": "text/x-python",
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "version": "3.10.13"
 }
},
"nbformat": 4,
"nbformat_minor": 5
}
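
Note: the deleted cells above only report means and single plots; the short sketch below is an editorial addition, assuming the same final_df built in the first cell, and summarizes the latency columns with percentiles, which is usually more informative than the mean for load-test results.

# Hypothetical follow-up cell, not part of the original notebook.
# Assumes final_df from the analysis cell above is in scope.
summary = final_df[["ttft_s", "end_to_end_latency_s", "generation_throughput"]].quantile(
    [0.50, 0.90, 0.99]
)
print(summary)
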
llmperf/llm_correctness.py
DELETED
@@ -1,309 +0,0 @@
import argparse
import json
import os
from pathlib import Path
import random
import re
import time
from typing import Any, Dict, List, Optional, Tuple

import num2words
import ray
from tqdm import tqdm

from llmperf import common_metrics
from llmperf.common import SUPPORTED_APIS, construct_clients
from llmperf.models import RequestConfig
from llmperf.requests_launcher import RequestsLauncher
from llmperf.utils import (
    LLMPerfResults,
)

MAX_RANDOM_NUMBER = 10000


def llm_correctness(
    model: str,
    additional_sampling_params: Optional[Dict[str, Any]] = None,
    num_concurrent_requests: int = 1,
    max_num_completed_requests: int = 500,
    test_timeout_s=90,
    llm_api="chat",
) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
    """Get the token throughput and latencies for the given model.

    Args:
        model: The name of the model to query.
        additional_sampling_params: Additional sampling parameters to send with the request.
            For more information see the LLM APIs documentation for the completions
        num_concurrent_requests: The number of concurrent requests to make. Increase
            this to increase the amount of load and vice versa.
        test_timeout_s: The amount of time to run the test for before reporting results.
        llm_api: The type of request to make. Either "chat" or "litellm".

    Returns:
        A tuple containing summary metrics and raw results from the test.

    """

    if not additional_sampling_params:
        additional_sampling_params = {}

    clients = construct_clients(llm_api=llm_api, num_clients=num_concurrent_requests)
    req_launcher = RequestsLauncher(clients)
    start_time = time.monotonic()

    num_errored_requests = 0
    num_mismatched_requests = 0
    num_completed_requests = 0

    sampling_params = {"temperature": 0.0}
    sampling_params.update(additional_sampling_params)
    completed_requests = []
    iter = 0
    pbar = tqdm(total=max_num_completed_requests)
    while (
        time.monotonic() - start_time < test_timeout_s
        and num_completed_requests < max_num_completed_requests
    ):
        iter += 1
        rnd_number = random.randint(0, MAX_RANDOM_NUMBER)
        rnd_num_words = num2words.num2words(rnd_number)

        prompt = f"Convert the following sequence of words into a number: {rnd_num_words}.\nPrint the number first."

        request_config = RequestConfig(
            model=model,
            prompt=(prompt, 0),
            sampling_params=sampling_params,
            metadata={"rnd_number": rnd_number},
            llm_api=llm_api,
        )
        req_launcher.launch_requests(request_config)

        if not (iter % num_concurrent_requests):
            completed_requests.extend(req_launcher.get_next_ready())
            pbar.update(len(completed_requests) - num_completed_requests)
            num_completed_requests = len(completed_requests)

    pbar.close()
    end_time = time.monotonic()
    if end_time - start_time >= test_timeout_s:
        print("Test timed out before all requests could be completed.")

    raw_results = []

    print("Mismatched and errored requests.")
    for out in completed_requests:
        metrics, generated_text, completed_request_config = out

        raw_results.append(
            {
                "metrics": metrics,
                "generated_text": generated_text,
                "request_config": dict(completed_request_config),
            }
        )

        # if there were no errors when making request.
        if not metrics[common_metrics.ERROR_CODE]:
            try:
                commas_between_numbers_re = r"(\d+),(?=\d)"
                gen_text_commas_removed = re.sub(
                    commas_between_numbers_re, r"\1", generated_text
                )
                nums = re.findall(r"\d+", gen_text_commas_removed)
                generated_text = gen_text_commas_removed.replace("\n", " ")

                assert str(completed_request_config.metadata["rnd_number"]) in nums
            except:
                num_mismatched_requests += 1
                print(
                    f" mismatched request: {generated_text}, expected: {completed_request_config.metadata['rnd_number']}"
                )
        else:
            num_errored_requests += 1
            print(
                f" The request errored: {metrics[common_metrics.ERROR_CODE]}, "
                f"{metrics[common_metrics.ERROR_MSG]} "
            )
    print()

    error_rate = num_errored_requests / num_completed_requests
    mismatch_rate = num_mismatched_requests / num_completed_requests
    num_non_errored_requests = num_completed_requests - num_errored_requests
    summary_metrics = {}
    summary_metrics[common_metrics.NUM_ERRORS] = num_errored_requests
    summary_metrics["num_mismatched_requests"] = num_mismatched_requests
    summary_metrics["error_rate"] = error_rate
    summary_metrics["mismatch_rate"] = mismatch_rate
    summary_metrics[common_metrics.NUM_COMPLETED_REQUESTS] = num_completed_requests
    summary_metrics["num_non_errored_requests"] = num_non_errored_requests

    # Metadata
    summary_metrics["model"] = model
    summary_metrics["num_concurrent_requests"] = num_concurrent_requests
    summary_metrics["additional_sampling_params"] = additional_sampling_params
    summary_metrics["llm_api"] = llm_api

    return summary_metrics, raw_results


def run(
    llm_api: str,
    model: str,
    test_timeout_s: int,
    max_num_completed_requests: int,
    num_concurrent_requests: int,
    additional_sampling_params: str,
    results_dir: str,
    user_metadata: Dict[str, str],
):
    """
    Args:
        llm_api: The type of request to make. Either "chat" or "litellm".
        model: The name of the model to query.
        max_num_completed_requests: The number of requests to complete before finishing the test.
        test_timeout_s: The amount of time to run the test for before reporting results.
        num_concurrent_requests: The number of concurrent requests to make. Increase
            this to increase the amount of load and vice versa.
        mean_input_tokens: The mean number of tokens to send in the prompt for the request.
        stddev_input_tokens: The standard deviation of the number of tokens to send in the prompt for the request.
        mean_output_tokens: The mean number of tokens to generate per request.
        stddev_output_tokens: The standard deviation of the number of tokens to generate per request.
        additional_sampling_params: Additional sampling parameters to send with the request.
            For more information see the LLM APIs documentation for the completions.
        results_dir: The directory to save the results to.

    """

    summary_metrics, raw_results = llm_correctness(
        model=model,
        llm_api=llm_api,
        test_timeout_s=test_timeout_s,
        max_num_completed_requests=max_num_completed_requests,
        num_concurrent_requests=num_concurrent_requests,
        additional_sampling_params=json.loads(additional_sampling_params),
    )

    time.sleep(2)

    print(
        f"Results for llm correctness test for {model} queried with the {llm_api} api."
    )
    print(
        f"Errors: {summary_metrics[common_metrics.NUM_ERRORS]}, "
        f"Error rate: {summary_metrics['error_rate']}"
    )

    print(
        f"Mismatched: {summary_metrics['num_mismatched_requests']}, "
        f"Mismatch rate: {summary_metrics['mismatch_rate']}"
    )
    print(f"Completed: {summary_metrics[common_metrics.NUM_COMPLETED_REQUESTS]}")
    print(f"Completed without errors: {summary_metrics['num_non_errored_requests']}")

    if results_dir:
        file_name = f"{model}_correctness"
        file_name = re.sub(r"[^\w\d-]+", "-", file_name)
        file_name = re.sub(r"-{2,}", "-", file_name)
        summary_file_name = f"{file_name}_summary"
        individual_responses_filename = f"{file_name}_individual_responses"
        summary_metrics.update(user_metadata)
        results = LLMPerfResults(name=summary_file_name, metadata=summary_metrics)
        results_dir = Path(results_dir)
        if not results_dir.exists():
            results_dir.mkdir(parents=True)
        elif not results_dir.is_dir():
            raise ValueError(f"{results_dir} is not a directory")
        with open(results_dir / f"{summary_file_name}.json", "w") as f:
            json.dump(results.to_dict(), f, indent=4)
        with open(results_dir / f"{individual_responses_filename}.json", "w") as f:
            json.dump(raw_results, f, indent=4)


args = argparse.ArgumentParser(description="Run a correctness test for a given model.")

args.add_argument(
    "--model", type=str, required=True, help="The model to use for this load test."
)
args.add_argument(
    "--num-concurrent-requests",
    type=int,
    default=10,
    help=("The number of concurrent requests to send. (default: %(default)s)"),
)
args.add_argument(
    "--timeout",
    type=int,
    default=90,
    help="The amount of time to run the load test for. (default: %(default)s)",
)
args.add_argument(
    "--max-num-completed-requests",
    type=int,
    default=50,
    help=(
        "The number of requests to complete before finishing the test. Note "
        "that its possible for the test to timeout first. (default: %(default)s)"
    ),
)
args.add_argument(
    "--additional-sampling-params",
    type=str,
    default="{}",
    help=(
        "Additional sampling params to send with the each request to the LLM API. "
        "(default: %(default)s) No additional sampling params are sent."
    ),
)
args.add_argument(
    "--results-dir",
    type=str,
    default="",
    help=(
        "The directory to save the results to. "
        "(`default: %(default)s`) No results are saved)"
    ),
)
args.add_argument(
    "--llm-api",
    type=str,
    default="openai",
    help=(
        f"The type of request to make. The supported llm apis are {SUPPORTED_APIS} "
        " (`default: %(default)s`)"
    ),
)
args.add_argument(
    "--metadata",
    type=str,
    default="",
    help=(
        "A comma separated list of metadata to include in the results, e.g. "
        "name=foo,bar=1. These will be added to the metadata field of the results. "
    ),
)

if __name__ == "__main__":
    args = args.parse_args()

    env_vars = dict(os.environ)
    ray.init(runtime_env={"env_vars": env_vars})
    # Parse user metadata.
    user_metadata = {}
    if args.metadata:
        for item in args.metadata.split(","):
            key, value = item.split("=")
            user_metadata[key] = value

    run(
        llm_api=args.llm_api,
        model=args.model,
        test_timeout_s=args.timeout,
        max_num_completed_requests=args.max_num_completed_requests,
        num_concurrent_requests=args.num_concurrent_requests,
        additional_sampling_params=args.additional_sampling_params,
        results_dir=args.results_dir,
        user_metadata=user_metadata,
    )
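
Note: a minimal, hypothetical sketch of driving the deleted script above from Python rather than its CLI, assuming the repository root is on the import path and an OpenAI-compatible endpoint is already configured; the model id and request counts are placeholders.

# Hypothetical driver for llm_correctness.py; model id and counts are placeholders.
import ray
from llm_correctness import llm_correctness

ray.init()
summary, raw_results = llm_correctness(
    model="my-model",               # placeholder model id
    llm_api="openai",               # one of the APIs handled by construct_clients
    num_concurrent_requests=2,
    max_num_completed_requests=10,
)
print(summary["error_rate"], summary["mismatch_rate"])
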
llmperf/pre-commit.sh
DELETED
@@ -1,5 +0,0 @@
#!/bin/bash
echo "Running pre-hooks before committing..."

echo "======FORMAT====="
black . -q
llmperf/pyproject.toml
DELETED
@@ -1,23 +0,0 @@
[build-system]
requires = ["setuptools>=43.0.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "LLMPerf"
version = "0.1.0"
description = "A framework for load testing LLM APIs"
authors = [{name="Avnish Narayan", email="[email protected]"}]
license = {text= "Apache-2.0"}
requires-python = ">=3.8, <3.11"
dependencies = ["pydantic<2.5",
                "ray",
                "pytest>=6.0",
                "seaborn>=0.11",
                "awscli>=1.22",
                "typer>=0.4",
                "litellm>=0.1.738",
                "num2words",
                "transformers",
                "tqdm",
                "boto3",
                "google-cloud-aiplatform"]
llmperf/requirements-dev.txt
DELETED
@@ -1,2 +0,0 @@
# For lints
black
llmperf/src/llmperf/__init__.py
DELETED
@@ -1 +0,0 @@
llmperf/src/llmperf/common.py
DELETED
@@ -1,38 +0,0 @@
from typing import List
from llmperf.ray_clients.litellm_client import LiteLLMClient
from llmperf.ray_clients.openai_chat_completions_client import (
    OpenAIChatCompletionsClient,
)
from llmperf.ray_clients.sagemaker_client import SageMakerClient
from llmperf.ray_clients.vertexai_client import VertexAIClient
from llmperf.ray_llm_client import LLMClient


SUPPORTED_APIS = ["openai", "anthropic", "litellm"]


def construct_clients(llm_api: str, num_clients: int) -> List[LLMClient]:
    """Construct LLMClients that will be used to make requests to the LLM API.

    Args:
        llm_api: The name of the LLM API to use.
        num_clients: The number of concurrent requests to make.

    Returns:
        The constructed LLMCLients

    """
    if llm_api == "openai":
        clients = [OpenAIChatCompletionsClient.remote() for _ in range(num_clients)]
    elif llm_api == "sagemaker":
        clients = [SageMakerClient.remote() for _ in range(num_clients)]
    elif llm_api == "vertexai":
        clients = [VertexAIClient.remote() for _ in range(num_clients)]
    elif llm_api in SUPPORTED_APIS:
        clients = [LiteLLMClient.remote() for _ in range(num_clients)]
    else:
        raise ValueError(
            f"llm_api must be one of the supported LLM APIs: {SUPPORTED_APIS}"
        )

    return clients
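
Note: a small usage sketch for construct_clients, mirroring how llm_correctness.py wires the clients into the launcher; the concrete API name and client count are placeholder values.

# Hypothetical usage; mirrors the call sites in llm_correctness.py.
from llmperf.common import construct_clients
from llmperf.requests_launcher import RequestsLauncher

clients = construct_clients(llm_api="openai", num_clients=4)  # 4 Ray client actors
req_launcher = RequestsLauncher(clients)
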
llmperf/src/llmperf/common_metrics.py
DELETED
@@ -1,17 +0,0 @@
# TODO (Avnishn): compute metrics in class
INTER_TOKEN_LAT = "inter_token_latency_s"
TTFT = "ttft_s"
E2E_LAT = "end_to_end_latency_s"
NUM_INPUT_TOKENS = "number_input_tokens"
NUM_OUTPUT_TOKENS = "number_output_tokens"
NUM_TOTAL_TOKENS = "number_total_tokens"
REQ_OUTPUT_THROUGHPUT = "request_output_throughput_token_per_s"
ERROR_MSG = "error_msg"
ERROR_CODE = "error_code"
ERROR_CODE_FREQ = "error_code_frequency"
NUM_ERRORS = "number_errors"
OUTPUT_THROUGHPUT = "mean_output_throughput_token_per_s"
NUM_COMPLETED_REQUESTS = "num_completed_requests"
COMPLETED_REQUESTS_PER_MIN = "num_completed_requests_per_min"
ERROR_RATE = "error_rate"
NUM_REQ_STARTED = "num_requests_started"
llmperf/src/llmperf/models.py
DELETED
@@ -1,21 +0,0 @@
from typing import Any, Dict, List, Optional, Tuple
from pydantic import BaseModel


class RequestConfig(BaseModel):
    """The configuration for a request to the LLM API.

    Args:
        model: The model to use.
        prompt: The prompt to provide to the LLM API.
        sampling_params: Additional sampling parameters to send with the request.
            For more information see the Router app's documentation for the completions
        llm_api: The name of the LLM API to send the request to.
        metadata: Additional metadata to attach to the request for logging or validation purposes.
    """

    model: str
    prompt: Tuple[str, int]
    sampling_params: Optional[Dict[str, Any]] = None
    llm_api: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None
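
Note: a small instantiation sketch for the model above, matching how llm_correctness.py builds a RequestConfig; the prompt text and its token count are assumed placeholder values.

# Hypothetical RequestConfig; prompt text and token count are placeholders.
from llmperf.models import RequestConfig

config = RequestConfig(
    model="my-model",                                  # placeholder model id
    prompt=("Convert the following sequence of words into a number: twelve.", 12),
    sampling_params={"temperature": 0.0},
    llm_api="openai",
    metadata={"rnd_number": 12},
)
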
llmperf/src/llmperf/ray_clients/__init__.py
DELETED
File without changes
llmperf/src/llmperf/ray_clients/litellm_client.py
DELETED
@@ -1,100 +0,0 @@
import time
from typing import Any, Dict
import ray

from llmperf.ray_llm_client import LLMClient
from llmperf.models import RequestConfig
from llmperf import common_metrics


@ray.remote
class LiteLLMClient(LLMClient):
    """Client for LiteLLM Completions API."""

    def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]:
        # litellm package isn't serializable, so we import it within the function
        # to maintain compatibility with ray.
        from litellm import completion, validate_environment

        prompt = request_config.prompt
        prompt, prompt_len = prompt

        message = [
            {"role": "system", "content": ""},
            {"role": "user", "content": prompt},
        ]
        assert (
            request_config.llm_api is not None
        ), "the request config's llm_api must be set."
        if request_config.llm_api == "litellm":
            model = request_config.model
        else:
            model = request_config.llm_api + "/" + request_config.model
        validation_result = validate_environment(model)
        if validation_result["missing_keys"]:
            raise ValueError(
                f"The following environment vars weren't found but were necessary for "
                f"the model {request_config.model}: {validation_result['missing_keys']}"
            )
        body = {
            "model": model,
            "messages": message,
            "stream": True,
        }
        sampling_params = request_config.sampling_params
        body.update(sampling_params or {})

        time_to_next_token = []
        tokens_received = 0
        ttft = 0
        error_response_code = -1
        generated_text = ""
        error_msg = ""
        output_throughput = 0
        total_request_time = 0

        metrics = {}

        metrics[common_metrics.ERROR_CODE] = None
        metrics[common_metrics.ERROR_MSG] = ""

        try:
            start_time = time.monotonic()
            most_recent_received_token_time = time.monotonic()

            response = completion(**body)
            ttft = 0
            for tok in response:
                if tok.choices[0].delta:
                    delta = tok.choices[0].delta
                    if delta.get("content", None):
                        if ttft == 0:
                            ttft = time.monotonic() - start_time
                            time_to_next_token.append(ttft)
                        else:
                            time_to_next_token.append(
                                time.monotonic() - most_recent_received_token_time
                            )
                        generated_text += delta["content"]
                        most_recent_received_token_time = time.monotonic()
                        tokens_received += 1

            total_request_time = time.monotonic() - start_time

            output_throughput = tokens_received / total_request_time

        except Exception as e:
            metrics[common_metrics.ERROR_MSG] = error_msg
            metrics[common_metrics.ERROR_CODE] = error_response_code

            print(f"Warning Or Error: {e}")
            print(error_response_code)

        metrics[common_metrics.INTER_TOKEN_LAT] = sum(time_to_next_token)
        metrics[common_metrics.TTFT] = ttft
        metrics[common_metrics.E2E_LAT] = total_request_time
        metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = output_throughput
        metrics[common_metrics.NUM_TOTAL_TOKENS] = tokens_received + prompt_len
        metrics[common_metrics.NUM_OUTPUT_TOKENS] = tokens_received
        metrics[common_metrics.NUM_INPUT_TOKENS] = prompt_len
        return metrics, generated_text, request_config
llmperf/src/llmperf/ray_clients/openai_chat_completions_client.py
DELETED
@@ -1,120 +0,0 @@
import json
import os
import time
from typing import Any, Dict

import ray
import requests

from llmperf.ray_llm_client import LLMClient
from llmperf.models import RequestConfig
from llmperf import common_metrics


@ray.remote
class OpenAIChatCompletionsClient(LLMClient):
    """Client for OpenAI Chat Completions API."""

    def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]:
        prompt = request_config.prompt
        prompt, prompt_len = prompt

        message = [
            {"role": "system", "content": ""},
            {"role": "user", "content": prompt},
        ]
        model = request_config.model
        body = {
            "model": model,
            "messages": message,
            "stream": True,
        }
        sampling_params = request_config.sampling_params
        body.update(sampling_params or {})
        time_to_next_token = []
        tokens_received = 0
        ttft = 0
        error_response_code = -1
        generated_text = ""
        error_msg = ""
        output_throughput = 0
        total_request_time = 0

        metrics = {}

        metrics[common_metrics.ERROR_CODE] = None
        metrics[common_metrics.ERROR_MSG] = ""

        start_time = time.monotonic()
        most_recent_received_token_time = time.monotonic()
        address = os.environ.get("OPENAI_API_BASE")
        if not address:
            raise ValueError("the environment variable OPENAI_API_BASE must be set.")
        key = os.environ.get("OPENAI_API_KEY")
        if not key:
            raise ValueError("the environment variable OPENAI_API_KEY must be set.")
        headers = {"Authorization": f"Bearer {key}"}
        if not address:
            raise ValueError("No host provided.")
        if not address.endswith("/"):
            address = address + "/"
        address += "chat/completions"
        try:
            with requests.post(
                address,
                json=body,
                stream=True,
                timeout=180,
                headers=headers,
            ) as response:
                if response.status_code != 200:
                    error_msg = response.text
                    error_response_code = response.status_code
                    response.raise_for_status()
                for chunk in response.iter_lines(chunk_size=None):
                    chunk = chunk.strip()

                    if not chunk:
                        continue
                    stem = "data: "
                    chunk = chunk[len(stem) :]
                    if chunk == b"[DONE]":
                        continue
                    tokens_received += 1
                    data = json.loads(chunk)

                    if "error" in data:
                        error_msg = data["error"]["message"]
                        error_response_code = data["error"]["code"]
                        raise RuntimeError(data["error"]["message"])

                    delta = data["choices"][0]["delta"]
                    if delta.get("content", None):
                        if not ttft:
                            ttft = time.monotonic() - start_time
                            time_to_next_token.append(ttft)
                        else:
                            time_to_next_token.append(
                                time.monotonic() - most_recent_received_token_time
                            )
                        most_recent_received_token_time = time.monotonic()
                        generated_text += delta["content"]

            total_request_time = time.monotonic() - start_time
            output_throughput = tokens_received / total_request_time

        except Exception as e:
            metrics[common_metrics.ERROR_MSG] = error_msg
            metrics[common_metrics.ERROR_CODE] = error_response_code
            print(f"Warning Or Error: {e}")
            print(error_response_code)

        metrics[common_metrics.INTER_TOKEN_LAT] = sum(time_to_next_token)  # This should be same as metrics[common_metrics.E2E_LAT]. Leave it here for now
        metrics[common_metrics.TTFT] = ttft
        metrics[common_metrics.E2E_LAT] = total_request_time
        metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = output_throughput
        metrics[common_metrics.NUM_TOTAL_TOKENS] = tokens_received + prompt_len
        metrics[common_metrics.NUM_OUTPUT_TOKENS] = tokens_received
        metrics[common_metrics.NUM_INPUT_TOKENS] = prompt_len

        return metrics, generated_text, request_config
|
|
|
|
|
|
|
|
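Since this client now comes from the upstream checkout rather than a vendored copy, a brief usage note may help: the class is a Ray actor whose llm_request returns a (metrics, generated_text, request_config) triple. The following is a minimal sketch, not part of this commit; the endpoint URL, API key, and model name are placeholders, and it assumes the cloned llmperf package is importable.

import os

import ray

from llmperf.models import RequestConfig
from llmperf.ray_clients.openai_chat_completions_client import OpenAIChatCompletionsClient

# Placeholder endpoint and key: the client reads both from the environment.
os.environ.setdefault("OPENAI_API_BASE", "http://localhost:8080/v1")
os.environ.setdefault("OPENAI_API_KEY", "dummy-key")

# Forward the environment to the Ray actor, as token_benchmark_ray.py does.
ray.init(runtime_env={"env_vars": dict(os.environ)}, ignore_reinit_error=True)

client = OpenAIChatCompletionsClient.remote()
config = RequestConfig(
    model="my-model",  # placeholder model name served by the endpoint
    prompt=("Write one sentence about load testing.", 8),  # (prompt text, prompt token count)
    sampling_params={"max_tokens": 64},
)

# Blocks until the streamed response finishes, then returns the collected metrics.
metrics, generated_text, _ = ray.get(client.llm_request.remote(config))
print(generated_text)
print(metrics)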
llmperf/src/llmperf/ray_clients/sagemaker_client.py
DELETED
@@ -1,158 +0,0 @@
import io
import json
import os
import time
from typing import Any, Dict

import boto3
import ray
from transformers import LlamaTokenizerFast

from llmperf.ray_llm_client import LLMClient
from llmperf.models import RequestConfig
from llmperf import common_metrics


@ray.remote
class SageMakerClient(LLMClient):
    """Client for OpenAI Chat Completions API."""

    def __init__(self):
        # Sagemaker doesn't return the number of tokens that are generated so we approximate it by
        # using the llama tokenizer.
        self.tokenizer = LlamaTokenizerFast.from_pretrained(
            "hf-internal-testing/llama-tokenizer"
        )

    def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]:
        if not os.environ.get("AWS_ACCESS_KEY_ID"):
            raise ValueError("AWS_ACCESS_KEY_ID must be set.")
        if not os.environ.get("AWS_SECRET_ACCESS_KEY"):
            raise ValueError("AWS_SECRET_ACCESS_KEY must be set.")
        if not os.environ.get("AWS_REGION_NAME"):
            raise ValueError("AWS_REGION_NAME must be set.")

        prompt = request_config.prompt
        prompt, prompt_len = prompt

        message = [
            {"role": "system", "content": ""},
            {"role": "user", "content": prompt},
        ]
        model = request_config.model
        sm_runtime = boto3.client(
            "sagemaker-runtime", region_name=os.environ.get("AWS_REGION_NAME")
        )

        sampling_params = request_config.sampling_params

        if "max_tokens" in sampling_params:
            sampling_params["max_new_tokens"] = sampling_params["max_tokens"]
            del sampling_params["max_tokens"]

        message = {
            "inputs": [
                [
                    {"role": "system", "content": ""},
                    {"role": "user", "content": prompt},
                ]
            ],
            "parameters": {
                **request_config.sampling_params,
            },
        }

        time_to_next_token = []
        tokens_received = 0
        ttft = 0
        error_response_code = None
        generated_text = ""
        error_msg = ""
        output_throughput = 0
        total_request_time = 0
        metrics = {}

        start_time = time.monotonic()
        most_recent_received_token_time = time.monotonic()

        try:
            response = sm_runtime.invoke_endpoint_with_response_stream(
                EndpointName=model,
                ContentType="application/json",
                Body=json.dumps(message),
                CustomAttributes="accept_eula=true",
            )

            event_stream = response["Body"]
            json_byte = b""
            for line, ttft, _ in LineIterator(event_stream):
                json_byte += line
                time_to_next_token.append(
                    time.monotonic() - most_recent_received_token_time
                )
                most_recent_received_token_time = time.monotonic()
            ttft = ttft - start_time
            resp = json.loads(json_byte)
            total_request_time = time.monotonic() - start_time
            generated_text = resp[0]["generation"]["content"]
            tokens_received = len(self.tokenizer.encode(generated_text))
            output_throughput = tokens_received / total_request_time

        except Exception as e:
            print(f"Warning Or Error: {e}")
            print(error_response_code)
            error_msg = str(e)
            error_response_code = 500

        metrics[common_metrics.ERROR_MSG] = error_msg
        metrics[common_metrics.ERROR_CODE] = error_response_code
        metrics[common_metrics.INTER_TOKEN_LAT] = time_to_next_token
        metrics[common_metrics.TTFT] = ttft
        metrics[common_metrics.E2E_LAT] = total_request_time
        metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = output_throughput
        metrics[common_metrics.NUM_TOTAL_TOKENS] = tokens_received + prompt_len
        metrics[common_metrics.NUM_OUTPUT_TOKENS] = tokens_received
        metrics[common_metrics.NUM_INPUT_TOKENS] = prompt_len

        return metrics, generated_text, request_config


class LineIterator:
    """
    A helper class for parsing the byte stream input.
    Reference: https://aws.amazon.com/blogs/machine-learning/elevating-the-generative-ai-experience-introducing-streaming-support-in-amazon-sagemaker-hosting/
    """

    def __init__(self, stream):
        self.byte_iterator = iter(stream)
        self.buffer = io.BytesIO()
        self.read_pos = 0
        self.ttft = 0

    def __iter__(self):
        return self

    def __next__(self):
        while True:
            self.buffer.seek(self.read_pos)
            line = self.buffer.readline()
            if line and line[-1] == ord("\n"):
                if self.ttft == 0:
                    self.ttft = time.monotonic()
                self.read_pos += len(line)
                return line[:-1], self.ttft, time.monotonic()
            # kyle: dealing with last ']' for chat output
            if line and self.read_pos == self.buffer.getbuffer().nbytes - 1:
                self.read_pos += 1
                return line, self.ttft, time.monotonic()
            try:
                chunk = next(self.byte_iterator)
            except StopIteration:
                if self.read_pos < self.buffer.getbuffer().nbytes:
                    continue
                raise
            if "PayloadPart" not in chunk:
                print("Unknown event type:" + chunk)
                continue
            self.buffer.seek(0, io.SEEK_END)
            self.buffer.write(chunk["PayloadPart"]["Bytes"])
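A note on the LineIterator helper above: it stitches newline-delimited JSON back together from the streamed PayloadPart events. A small self-contained sketch, not part of this commit, that feeds a trimmed-down copy of the same buffering logic an in-memory stream shaped like those events (the payload bytes are made up):

import io
import time

# Stand-in for the SageMaker event stream: an iterable of PayloadPart events.
fake_event_stream = [
    {"PayloadPart": {"Bytes": b'{"token": {"text": "Hel'}},
    {"PayloadPart": {"Bytes": b'lo"}}\n'}},
    {"PayloadPart": {"Bytes": b'{"token": {"text": " world"}}\n'}},
]


class TrimmedLineIterator:
    """Happy-path copy of the buffering logic from the deleted LineIterator."""

    def __init__(self, stream):
        self.byte_iterator = iter(stream)
        self.buffer = io.BytesIO()
        self.read_pos = 0
        self.ttft = 0

    def __iter__(self):
        return self

    def __next__(self):
        while True:
            self.buffer.seek(self.read_pos)
            line = self.buffer.readline()
            if line and line[-1] == ord("\n"):
                if self.ttft == 0:
                    self.ttft = time.monotonic()
                self.read_pos += len(line)
                return line[:-1], self.ttft, time.monotonic()
            # No complete line buffered yet: pull the next event and append its bytes.
            chunk = next(self.byte_iterator)  # StopIteration ends the iteration
            self.buffer.seek(0, io.SEEK_END)
            self.buffer.write(chunk["PayloadPart"]["Bytes"])


for line, ttft, received_at in TrimmedLineIterator(fake_event_stream):
    print(line)  # b'{"token": {"text": "Hello"}}' then b'{"token": {"text": " world"}}'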
llmperf/src/llmperf/ray_clients/vertexai_client.py
DELETED
@@ -1,135 +0,0 @@
import json
import os
import time
from typing import Any, Dict

import ray
import requests
from transformers import LlamaTokenizerFast

from llmperf.ray_llm_client import LLMClient
from llmperf.models import RequestConfig
from llmperf import common_metrics


@ray.remote
class VertexAIClient(LLMClient):
    """Client for VertexAI API."""

    def __init__(self):
        # VertexAI doesn't return the number of tokens that are generated so we approximate it by
        # using the llama tokenizer.
        self.tokenizer = LlamaTokenizerFast.from_pretrained(
            "hf-internal-testing/llama-tokenizer"
        )

    def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]:
        project_id = os.environ.get("GCLOUD_PROJECT_ID")
        region = os.environ.get("GCLOUD_REGION")
        endpoint_id = os.environ.get("VERTEXAI_ENDPOINT_ID")
        access_token = os.environ.get("GCLOUD_ACCESS_TOKEN").strip()
        if not project_id:
            raise ValueError("the environment variable GCLOUD_PROJECT_ID must be set.")
        if not region:
            raise ValueError("the environment variable GCLOUD_REGION must be set.")
        if not endpoint_id:
            raise ValueError(
                "the environment variable VERTEXAI_ENDPOINT_ID must be set."
            )
        if not access_token:
            raise ValueError(
                "the environment variable GCLOUD_ACCESS_TOKEN must be set."
            )
        prompt = request_config.prompt
        prompt, prompt_len = prompt

        time_to_next_token = []
        tokens_received = 0
        ttft = 0
        generated_text = ""
        output_throughput = 0
        total_request_time = 0

        metrics = {}

        metrics[common_metrics.ERROR_CODE] = None
        metrics[common_metrics.ERROR_MSG] = ""

        try:
            # Define the URL for the request
            url = (
                f"https://{region}-aiplatform.googleapis.com/v1/projects/"
                f"{project_id}/locations/{region}/endpoints/{endpoint_id}:predict"
            )

            # Define the headers
            headers = {
                "Authorization": f"Bearer {access_token}",
                "Content-Type": "application/json",
            }

            sampling_params = request_config.sampling_params
            if "max_new_tokens" in sampling_params:
                sampling_params["maxOutputTokens"] = sampling_params.pop(
                    "max_new_tokens"
                )

            # Define the data payload
            data = {"instances": [{"prompt": prompt}], "parameters": sampling_params}

            # Make the POST request
            start_time = time.monotonic()
            response = requests.post(url, headers=headers, data=json.dumps(data))
            total_request_time = time.monotonic() - start_time
            response_code = response.status_code
            response.raise_for_status()
            # output from the endpoint is in the form:
            # {"predictions": ["Input: ... \nOutput:\n ..."]}
            generated_text = response.json()["predictions"][0].split("\nOutput:\n")[1]
            tokens_received = len(self.tokenizer.encode(generated_text))
            ttft = -1
            output_throughput = tokens_received / total_request_time
            time_to_next_token = [
                total_request_time / tokens_received for _ in range(tokens_received)
            ]

        except Exception as e:
            metrics[common_metrics.ERROR_MSG] = str(e)
            metrics[common_metrics.ERROR_CODE] = response_code
            print(f"Warning Or Error: {e}")
            print(response_code)
            print(response_code)

        metrics[common_metrics.INTER_TOKEN_LAT] = time_to_next_token
        metrics[common_metrics.TTFT] = ttft
        metrics[common_metrics.E2E_LAT] = total_request_time
        metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = output_throughput
        metrics[common_metrics.NUM_TOTAL_TOKENS] = tokens_received + prompt_len
        metrics[common_metrics.NUM_OUTPUT_TOKENS] = tokens_received
        metrics[common_metrics.NUM_INPUT_TOKENS] = prompt_len

        return metrics, generated_text, request_config


if __name__ == "__main__":
    # Run these before hand:

    # gcloud auth application-default login
    # gcloud config set project YOUR_PROJECT_ID
    # export GCLOUD_ACCESS_TOKEN=$(gcloud auth print-access-token)
    # export GCLOUD_PROJECT_ID=YOUR_PROJECT_ID
    # export GCLOUD_REGION=YOUR_REGION
    # export VERTEXAI_ENDPOINT_ID=YOUR_ENDPOINT_ID

    client = VertexAIClient.remote()
    request_config = RequestConfig(
        prompt=("Give me ten interview questions for the role of program manager.", 10),
        model="gpt3",
        sampling_params={
            "temperature": 0.2,
            "max_new_tokens": 256,
            "top_k": 40,
            "top_p": 0.95,
        },
    )
    ray.get(client.llm_request.remote(request_config))
llmperf/src/llmperf/ray_llm_client.py
DELETED
@@ -1,22 +0,0 @@
import abc
from typing import Any, Dict, Tuple

from llmperf.models import RequestConfig


class LLMClient:
    """A client for making requests to a LLM API e.g Anyscale Endpoints."""

    @abc.abstractmethod
    def llm_request(
        self, request_config: RequestConfig
    ) -> Tuple[Dict[str, Any], str, RequestConfig]:
        """Make a single completion request to a LLM API

        Returns:
            Metrics about the performance charateristics of the request.
            The text generated by the request to the LLM API.
            The request_config used to make the request. This is mainly for logging purposes.

        """
        ...
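This base class is the extension point for adding another backend: subclass it, decorate the subclass with @ray.remote, and return the (metrics, generated_text, request_config) triple. A minimal sketch, not part of this commit, of a hypothetical echo client that is handy for testing the plumbing without a real endpoint; the metric keys are the same common_metrics names used by the clients above.

import time

import ray

from llmperf import common_metrics
from llmperf.models import RequestConfig
from llmperf.ray_llm_client import LLMClient


@ray.remote
class EchoClient(LLMClient):
    """Dummy client that 'generates' the prompt itself, for plumbing tests."""

    def llm_request(self, request_config: RequestConfig):
        prompt, prompt_len = request_config.prompt
        start = time.monotonic()
        generated_text = prompt  # pretend the model echoed the prompt back
        elapsed = time.monotonic() - start

        metrics = {
            common_metrics.ERROR_CODE: None,
            common_metrics.ERROR_MSG: "",
            common_metrics.INTER_TOKEN_LAT: 0,
            common_metrics.TTFT: elapsed,
            common_metrics.E2E_LAT: elapsed,
            common_metrics.REQ_OUTPUT_THROUGHPUT: 0,
            common_metrics.NUM_INPUT_TOKENS: prompt_len,
            common_metrics.NUM_OUTPUT_TOKENS: prompt_len,
            common_metrics.NUM_TOTAL_TOKENS: 2 * prompt_len,
        }
        return metrics, generated_text, request_config


if __name__ == "__main__":
    ray.init(ignore_reinit_error=True)
    client = EchoClient.remote()
    cfg = RequestConfig(model="echo", prompt=("hello benchmark", 2), sampling_params={})
    print(ray.get(client.llm_request.remote(cfg)))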
llmperf/src/llmperf/requests_launcher.py
DELETED
@@ -1,48 +0,0 @@
from typing import Any, List

from llmperf.ray_llm_client import LLMClient
from llmperf.models import RequestConfig
from ray.util import ActorPool


class RequestsLauncher:
    """Launch requests from LLMClients to their respective LLM APIs."""

    def __init__(self, llm_clients: List[LLMClient]):
        self._llm_client_pool = ActorPool(llm_clients)

    def launch_requests(self, request_config: RequestConfig) -> None:
        """Launch requests to the LLM API.

        Args:
            request_config: The configuration for the request.

        """
        if self._llm_client_pool.has_free():
            self._llm_client_pool.submit(
                lambda client, _request_config: client.llm_request.remote(
                    _request_config
                ),
                request_config,
            )

    def get_next_ready(self, block: bool = False) -> List[Any]:
        """Return results that are ready from completed requests.

        Args:
            block: Whether to block until a result is ready.

        Returns:
            A list of results that are ready.

        """
        results = []
        if not block:
            while self._llm_client_pool.has_next():
                results.append(self._llm_client_pool.get_next_unordered())
        else:
            while not self._llm_client_pool.has_next():
                pass
            while self._llm_client_pool.has_next():
                results.append(self._llm_client_pool.get_next_unordered())
        return results
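The launcher is how the benchmark fans requests out over a pool of client actors. A short sketch, not part of this commit, of driving it by hand with the OpenAI-compatible clients; the endpoint, key, and model name are placeholders.

import os

import ray

from llmperf.common import construct_clients
from llmperf.models import RequestConfig
from llmperf.requests_launcher import RequestsLauncher

# The OpenAI-compatible clients read these from the environment (placeholders here).
os.environ.setdefault("OPENAI_API_BASE", "http://localhost:8080/v1")
os.environ.setdefault("OPENAI_API_KEY", "dummy-key")

ray.init(runtime_env={"env_vars": dict(os.environ)}, ignore_reinit_error=True)

# One actor per in-flight request, exactly as token_benchmark_ray.py sizes its pool.
clients = construct_clients(llm_api="openai", num_clients=2)
launcher = RequestsLauncher(clients)

for i in range(2):
    launcher.launch_requests(
        RequestConfig(
            model="my-model",  # placeholder model name
            prompt=(f"Benchmark request {i}.", 4),  # (prompt text, prompt token count)
            sampling_params={"max_tokens": 16},
            llm_api="openai",
        )
    )

# Block until results are available, then read them out unordered.
for metrics, generated_text, config in launcher.get_next_ready(block=True):
    print(config.model, generated_text[:60], metrics)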
llmperf/src/llmperf/sonnet.txt
DELETED
@@ -1,84 +0,0 @@
Shall I compare thee to a summer's day?
Thou art more lovely and more temperate:
Rough winds do shake the darling buds of May,
And summer's lease hath all too short a date:
Sometime too hot the eye of heaven shines,
And often is his gold complexion dimm'd;
And every fair from fair sometime declines,
By chance or nature's changing course untrimm'd;
But thy eternal summer shall not fade
Nor lose possession of that fair thou owest;
Nor shall Death brag thou wander'st in his shade,
When in eternal lines to time thou growest:
So long as men can breathe or eyes can see,
So long lives this and this gives life to thee.
Then let not winter's ragged hand deface
In thee thy summer, ere thou be distill'd:
Make sweet some vial; treasure thou some place
With beauty's treasure, ere it be self-kill'd.
That use is not forbidden usury,
Which happies those that pay the willing loan;
That's for thyself to breed another thee,
Or ten times happier, be it ten for one;
Ten times thyself were happier than thou art,
If ten of thine ten times refigured thee:
Then what could death do, if thou shouldst depart,
Leaving thee living in posterity?
Be not self-will'd, for thou art much too fair
To be death's conquest and make worms thine heir.
Where art thou, Muse, that thou forget'st so long
To speak of that which gives thee all thy might?
Spend'st thou thy fury on some worthless song,
Darkening thy power to lend base subjects light?
Return, forgetful Muse, and straight redeem
In gentle numbers time so idly spent;
Sing to the ear that doth thy lays esteem
And gives thy pen both skill and argument.
Rise, resty Muse, my love's sweet face survey,
If Time have any wrinkle graven there;
If any, be a satire to decay,
And make Time's spoils despised every where.
Give my love fame faster than Time wastes life;
So thou prevent'st his scythe and crooked knife.
My glass shall not persuade me I am old,
So long as youth and thou are of one date;
But when in thee time's furrows I behold,
Then look I death my days should expiate.
For all that beauty that doth cover thee
Is but the seemly raiment of my heart,
Which in thy breast doth live, as thine in me:
How can I then be elder than thou art?
O, therefore, love, be of thyself so wary
As I, not for myself, but for thee will;
Bearing thy heart, which I will keep so chary
As tender nurse her babe from faring ill.
Presume not on thy heart when mine is slain;
Thou gavest me thine, not to give back again.
So am I as the rich, whose blessed key
Can bring him to his sweet up-locked treasure,
The which he will not every hour survey,
For blunting the fine point of seldom pleasure.
Therefore are feasts so solemn and so rare,
Since, seldom coming, in the long year set,
Like stones of worth they thinly placed are,
Or captain jewels in the carcanet.
So is the time that keeps you as my chest,
Or as the wardrobe which the robe doth hide,
To make some special instant special blest,
By new unfolding his imprison'd pride.
Blessed are you, whose worthiness gives scope,
Being had, to triumph, being lack'd, to hope.
If there be nothing new, but that which is
Hath been before, how are our brains beguiled,
Which, labouring for invention, bear amiss
The second burden of a former child!
O, that record could with a backward look,
Even of five hundred courses of the sun,
Show me your image in some antique book,
Since mind at first in character was done!
That I might see what the old world could say
To this composed wonder of your frame;
Whether we are mended, or whether better they,
Or whether revolution be the same.
O, sure I am, the wits of former days
To subjects worse have given admiring praise.
llmperf/src/llmperf/utils.py
DELETED
@@ -1,147 +0,0 @@
import json
import math
import pathlib
import random
import subprocess
import time
from typing import Any, Dict, Tuple

from transformers import LlamaTokenizerFast


RESULTS_VERSION = "2023-08-31"


class LLMPerfResults:
    def __init__(
        self,
        name: str,
        metadata: Dict[str, Any] = None,
    ):
        self.name = name
        self.metadata = metadata or {}
        self.timestamp = int(time.time())
        self.metadata["timestamp"] = self.timestamp
        self.version = RESULTS_VERSION

    def to_dict(self):
        data = {
            "version": self.version,
            "name": self.name,
        }
        data.update(self.metadata)
        data = flatten_dict(data)
        return data

    def json(self):
        data = self.to_dict()
        return json.dumps(data)


def upload_to_s3(results_path: str, s3_path: str) -> None:
    """Upload the results to s3.

    Args:
        results_path: The path to the results file.
        s3_path: The s3 path to upload the results to.

    """

    command = ["aws", "s3", "sync", results_path, f"{s3_path}/"]
    result = subprocess.run(command)
    if result.returncode == 0:
        print("Files uploaded successfully!")
    else:
        print("An error occurred:")
        print(result.stderr)


def randomly_sample_sonnet_lines_prompt(
    prompt_tokens_mean: int = 550,
    prompt_tokens_stddev: int = 250,
    expect_output_tokens: int = 150,
) -> Tuple[str, int]:
    """Generate a prompt that randomly samples lines from a the shakespeare sonnet at sonnet.txt.

    Args:
        prompt_length_mean: The mean length of the prompt to generate.
        prompt_len_stddev: The standard deviation of the length of the prompt to generate.
        expect_output_tokens: The number of tokens to expect in the output. This is used to
            determine the length of the prompt. The prompt will be generated such that the output
            will be approximately this many tokens.

    Note:
        tokens will be counted from the sonnet using the Llama tokenizer. Using one tokenizer
        ensures a fairer comparison across different LLMs. For example, if gpt 3.5 tokenizes
        a prompt in less tokens than Llama2, then this will be reflected in the results since
        they will be fed identical prompts.

    Returns:
        A tuple of the prompt and the length of the prompt.
    """

    tokenizer = LlamaTokenizerFast.from_pretrained(
        "hf-internal-testing/llama-tokenizer"
    )

    get_token_length = lambda text: len(tokenizer.encode(text))

    prompt = (
        "Randomly stream lines from the following text "
        f"with {expect_output_tokens} output tokens. "
        "Don't generate eos tokens:\n\n"
    )
    # get a prompt length that is at least as long as the base
    num_prompt_tokens = sample_random_positive_int(
        prompt_tokens_mean, prompt_tokens_stddev
    )
    while num_prompt_tokens < get_token_length(prompt):
        num_prompt_tokens = sample_random_positive_int(
            prompt_tokens_mean, prompt_tokens_stddev
        )
    remaining_prompt_tokens = num_prompt_tokens - get_token_length(prompt)
    sonnet_path = pathlib.Path(__file__).parent.resolve() / "sonnet.txt"
    with open(sonnet_path, "r") as f:
        sonnet_lines = f.readlines()
    random.shuffle(sonnet_lines)
    sampling_lines = True
    while sampling_lines:
        for line in sonnet_lines:
            line_to_add = line
            if remaining_prompt_tokens - get_token_length(line_to_add) < 0:
                # This will cut off a line in the middle of a word, but that's ok since an
                # llm should be able to handle that.
                line_to_add = line_to_add[: int(math.ceil(remaining_prompt_tokens))]
                sampling_lines = False
                prompt += line_to_add
                break
            prompt += line_to_add
            remaining_prompt_tokens -= get_token_length(line_to_add)
    return (prompt, num_prompt_tokens)


def sample_random_positive_int(mean: int, stddev: int) -> int:
    """Sample random numbers from a gaussian distribution until a positive number is sampled.

    Args:
        mean: The mean of the gaussian distribution to sample from.
        stddev: The standard deviation of the gaussian distribution to sample from.

    Returns:
        A random positive integer sampled from the gaussian distribution.
    """
    ret = -1
    while ret <= 0:
        ret = int(random.gauss(mean, stddev))
    return ret


def flatten_dict(d, parent_key="", sep="_"):
    items = []
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)
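Two of these helpers are easy to illustrate in isolation. A quick sketch, not part of this commit, showing how flatten_dict and LLMPerfResults turn a nested summary into the flat record that gets written to JSON (the numbers and key names are made up):

from llmperf.utils import LLMPerfResults, flatten_dict

nested = {"results": {"ttft_s": {"mean": 0.21, "p90": 0.35}}, "model": "my-model"}

# Nested keys are joined with "_" so the record can be written as a flat row.
print(flatten_dict(nested))
# {'results_ttft_s_mean': 0.21, 'results_ttft_s_p90': 0.35, 'model': 'my-model'}

# LLMPerfResults stamps a timestamp and version, then flattens the metadata the same way.
record = LLMPerfResults(name="demo_summary", metadata=nested)
print(record.json())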
llmperf/token_benchmark_ray.py
DELETED
@@ -1,469 +0,0 @@
import argparse
from collections.abc import Iterable
import json
import os
from pathlib import Path
import re
import time
import random
from typing import Any, Dict, List, Optional, Tuple

import pandas as pd
import ray

from llmperf import common_metrics
from llmperf.common import SUPPORTED_APIS, construct_clients

from llmperf.models import RequestConfig
from llmperf.requests_launcher import RequestsLauncher
from llmperf.utils import (
    randomly_sample_sonnet_lines_prompt,
    LLMPerfResults,
    sample_random_positive_int,
)
from tqdm import tqdm

from transformers import LlamaTokenizerFast

def get_token_throughput_latencies(
    model: str,
    mean_input_tokens: int,
    stddev_input_tokens: int,
    mean_output_tokens: int,
    stddev_output_tokens: int,
    additional_sampling_params: Optional[Dict[str, Any]] = None,
    num_concurrent_requests: int = 1,
    max_num_completed_requests: int = 500,
    test_timeout_s=90,
    llm_api="openai",
) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
    """Get the token throughput and latencies for the given model.

    Args:
        model: The name of the model to query.
        mean_input_tokens: The mean number of tokens to send in the prompt for the request.
        stddev_input_tokens: The standard deviation of the number of tokens to send in the prompt for the request.
        mean_output_tokens: The mean number of tokens to generate per request.
        stddev_output_tokens: The standard deviation of the number of tokens to generate per request.
        additional_sampling_params: Additional sampling parameters to send with the request.
            For more information see the LLM APIs documentation for the completions
        num_concurrent_requests: The number of concurrent requests to make. Increase
            this to increase the amount of load and vice versa.
        test_timeout_s: The amount of time to run the test for before reporting results.
        llm_api: The name of the llm api to use. Either "openai" or "litellm".

    Returns:
        A summary of the performance metrics collected across all completed requests
        (e.g. throughput, latencies, etc.)
        The individual metrics for each request.
    """
    random.seed(11111)

    tokenizer = LlamaTokenizerFast.from_pretrained(
        "hf-internal-testing/llama-tokenizer"
    )
    get_token_length = lambda text: len(tokenizer.encode(text))

    if not additional_sampling_params:
        additional_sampling_params = {}

    clients = construct_clients(llm_api=llm_api, num_clients=num_concurrent_requests)
    req_launcher = RequestsLauncher(clients)
    completed_requests = []
    num_completed_requests = 0
    start_time = time.monotonic()
    iter = 0
    pbar = tqdm(total=max_num_completed_requests)
    while (
        time.monotonic() - start_time < test_timeout_s
        and len(completed_requests) < max_num_completed_requests
    ):
        iter += 1
        num_output_tokens = sample_random_positive_int(
            mean_output_tokens, stddev_output_tokens
        )

        prompt = randomly_sample_sonnet_lines_prompt(
            prompt_tokens_mean=mean_input_tokens,
            prompt_tokens_stddev=stddev_input_tokens,
            expect_output_tokens=num_output_tokens,
        )

        default_sampling_params = {"max_tokens": num_output_tokens}
        default_sampling_params.update(additional_sampling_params)
        request_config = RequestConfig(
            model=model,
            prompt=prompt,
            sampling_params=default_sampling_params,
            llm_api=llm_api,
        )
        req_launcher.launch_requests(request_config)
        # Retrieving results less frequently allows for more concurrent requests
        # to be launched. This will overall reduce the amount of time it takes
        # for the test to run.
        if not (iter % num_concurrent_requests):
            outs = req_launcher.get_next_ready()
            all_metrics = []
            for out in outs:
                request_metrics, gen_text, _ = out
                num_output_tokens = get_token_length(gen_text)
                if num_output_tokens:
                    request_metrics[common_metrics.INTER_TOKEN_LAT] /= num_output_tokens
                else:
                    request_metrics[common_metrics.INTER_TOKEN_LAT] = 0
                request_metrics[common_metrics.NUM_OUTPUT_TOKENS] = num_output_tokens
                request_metrics[common_metrics.NUM_TOTAL_TOKENS] = request_metrics[common_metrics.NUM_INPUT_TOKENS] + num_output_tokens
                request_metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = num_output_tokens / request_metrics[common_metrics.E2E_LAT]
                all_metrics.append(request_metrics)
            completed_requests.extend(all_metrics)
        pbar.update(len(completed_requests) - num_completed_requests)
        num_completed_requests = len(completed_requests)

    pbar.close()
    end_time = time.monotonic()
    if end_time - start_time >= test_timeout_s:
        print("Test timed out before all requests could be completed.")

    # check one last time that there are no remaining results to collect.
    outs = req_launcher.get_next_ready()
    all_metrics = []
    for out in outs:
        request_metrics, gen_text, _ = out
        num_output_tokens = get_token_length(gen_text)
        if num_output_tokens:
            request_metrics[common_metrics.INTER_TOKEN_LAT] /= num_output_tokens
        else:
            request_metrics[common_metrics.INTER_TOKEN_LAT] = 0
        request_metrics[common_metrics.NUM_OUTPUT_TOKENS] = num_output_tokens
        request_metrics[common_metrics.NUM_TOTAL_TOKENS] = request_metrics[common_metrics.NUM_INPUT_TOKENS] + num_output_tokens
        request_metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = num_output_tokens / request_metrics[common_metrics.E2E_LAT]

        all_metrics.append(request_metrics)
    completed_requests.extend(all_metrics)

    print(f"\Results for token benchmark for {model} queried with the {llm_api} api.\n")
    ret = metrics_summary(completed_requests, start_time, end_time)

    metadata = {
        "model": model,
        "mean_input_tokens": mean_input_tokens,
        "stddev_input_tokens": stddev_input_tokens,
        "mean_output_tokens": mean_output_tokens,
        "stddev_output_tokens": stddev_output_tokens,
        "num_concurrent_requests": num_concurrent_requests,
        "additional_sampling_params": additional_sampling_params,
    }

    metadata["results"] = ret

    return metadata, completed_requests


def metrics_summary(
    metrics: List[Dict[str, Any]], start_time: int, end_time: int
) -> Dict[str, Any]:
    """Generate a summary over metrics generated from potentially multiple instances of this client.

    Args:
        metrics: The metrics to summarize.
        start_time: The time the test started.
        end_time: The time the test ended.

    Returns:
        A summary with the following information:
            - Overall throughput (generated tokens / total test time)
            - Number of completed requests
            - Error rate
            - Error code frequency
            - Quantiles (p25-p99) for the following metrics:
                - Inter token latency
                - Time to first token
                - User total request time
                - Number of tokens processed per request
                - Number of tokens generated per request
                - User throughput (tokens / s)
    """
    ret = {}

    def flatten(item):
        for sub_item in item:
            if isinstance(sub_item, Iterable) and not isinstance(sub_item, str):
                yield from flatten(sub_item)
            else:
                yield sub_item

    df = pd.DataFrame(metrics)
    df_without_errored_req = df[df[common_metrics.ERROR_CODE].isna()]

    for key in [
        common_metrics.INTER_TOKEN_LAT,
        common_metrics.TTFT,
        common_metrics.E2E_LAT,
        common_metrics.REQ_OUTPUT_THROUGHPUT,
        common_metrics.NUM_INPUT_TOKENS,
        common_metrics.NUM_OUTPUT_TOKENS
    ]:
        print(key)
        ret[key] = {}
        series = pd.Series(list(flatten(df_without_errored_req[key]))).dropna()
        quantiles = series.quantile([0.25, 0.5, 0.75, 0.9, 0.95, 0.99]).to_dict()
        quantiles_reformatted_keys = {}
        for quantile, value in quantiles.items():
            reformatted_key = f"p{int(quantile * 100)}"
            print(f" {reformatted_key} = {value}")
            quantiles_reformatted_keys[reformatted_key] = value
        ret[key]["quantiles"] = quantiles_reformatted_keys
        mean = series.mean()
        print(f" mean = {mean}")
        ret[key]["mean"] = mean
        print(f" min = {series.min()}")
        ret[key]["min"] = series.min()
        print(f" max = {series.max()}")
        ret[key]["max"] = series.max()
        print(f" stddev = {series.std()}")
        ret[key]["stddev"] = series.std()

    ret[common_metrics.NUM_REQ_STARTED] = len(metrics)

    error_codes = df[common_metrics.ERROR_CODE].dropna()
    num_errors = len(error_codes)
    ret[common_metrics.ERROR_RATE] = num_errors / len(metrics) if len(metrics) else 0
    ret[common_metrics.NUM_ERRORS] = num_errors
    print(f"Number Of Errored Requests: {num_errors}")
    error_code_frequency = dict(error_codes.value_counts())
    if num_errors:
        error_code_frequency = dict(error_codes.value_counts())
        print("Error Code Frequency")
        print(error_code_frequency)
    ret[common_metrics.ERROR_CODE_FREQ] = str(error_code_frequency)

    overall_output_throughput = df_without_errored_req[
        common_metrics.NUM_OUTPUT_TOKENS
    ].sum() / (end_time - start_time)

    print(f"Overall Output Throughput: {overall_output_throughput}")
    ret[common_metrics.OUTPUT_THROUGHPUT] = overall_output_throughput

    num_completed_requests = len(df_without_errored_req)
    num_completed_requests_per_min = (
        num_completed_requests / (end_time - start_time) * 60
    )
    print(f"Number Of Completed Requests: {num_completed_requests}")
    print(f"Completed Requests Per Minute: {num_completed_requests_per_min}")

    ret[common_metrics.NUM_COMPLETED_REQUESTS] = num_completed_requests
    ret[common_metrics.COMPLETED_REQUESTS_PER_MIN] = num_completed_requests_per_min

    return ret


def run_token_benchmark(
    llm_api: str,
    model: str,
    test_timeout_s: int,
    max_num_completed_requests: int,
    num_concurrent_requests: int,
    mean_input_tokens: int,
    stddev_input_tokens: int,
    mean_output_tokens: int,
    stddev_output_tokens: int,
    additional_sampling_params: str,
    results_dir: str,
    user_metadata: Dict[str, Any],
):
    """
    Args:
        llm_api: The name of the llm api to use.
        model: The name of the model to query.
        max_num_completed_requests: The number of requests to complete before finishing the test.
        test_timeout_s: The amount of time to run the test for before reporting results.
        num_concurrent_requests: The number of concurrent requests to make. Increase
            this to increase the amount of load and vice versa.
        mean_input_tokens: The mean number of tokens to send in the prompt for the request.
        stddev_input_tokens: The standard deviation of the number of tokens to send in the prompt for the request.
        mean_output_tokens: The mean number of tokens to generate per request.
        stddev_output_tokens: The standard deviation of the number of tokens to generate per request.
        additional_sampling_params: Additional sampling parameters to send with the request.
            For more information see the LLM APIs documentation for the completions.
        results_dir: The directory to save the results to.
        user_metadata: Additional metadata to include in the results.
    """
    if mean_input_tokens < 40:
        print(
            "the minimum number of input tokens that will be sent is 41"
            " because of the prompting logic right now"
        )

    summary, individual_responses = get_token_throughput_latencies(
        model=model,
        llm_api=llm_api,
        test_timeout_s=test_timeout_s,
        max_num_completed_requests=max_num_completed_requests,
        mean_input_tokens=mean_input_tokens,
        stddev_input_tokens=stddev_input_tokens,
        mean_output_tokens=mean_output_tokens,
        stddev_output_tokens=stddev_output_tokens,
        num_concurrent_requests=num_concurrent_requests,
        additional_sampling_params=json.loads(additional_sampling_params),
    )

    if results_dir:
        filename = f"{model}_{mean_input_tokens}_{mean_output_tokens}"
        filename = re.sub(r"[^\w\d-]+", "-", filename)
        filename = re.sub(r"-{2,}", "-", filename)
        summary_filename = f"{filename}_summary"
        individual_responses_filename = f"{filename}_individual_responses"

        # Update to metadata.
        summary.update(user_metadata)

        results = LLMPerfResults(name=summary_filename, metadata=summary)
        results_dir = Path(results_dir)
        if not results_dir.exists():
            results_dir.mkdir(parents=True)
        elif not results_dir.is_dir():
            raise ValueError(f"{results_dir} is not a directory")

        try:
            with open(results_dir / f"{summary_filename}.json", "w") as f:
                json.dump(results.to_dict(), f, indent=4, default=str)
        except Exception as e:
            print(results.to_dict())
            raise e

        try:
            with open(results_dir / f"{individual_responses_filename}.json", "w") as f:
                json.dump(individual_responses, f, indent=4)
        except Exception as e:
            print(individual_responses)
            raise e


args = argparse.ArgumentParser(
    description="Run a token throughput and latency benchmark."
)

args.add_argument(
    "--model", type=str, required=True, help="The model to use for this load test."
)
args.add_argument(
    "--mean-input-tokens",
    type=int,
    default=550,
    help=(
        "The mean number of tokens to send in the prompt for the request. "
        " (default: %(default)s)"
    ),
)
args.add_argument(
    "--stddev-input-tokens",
    type=int,
    default=150,
    help=(
        "The standard deviation of number of tokens to send in the prompt for the request. "
        "(default: %(default)s)"
    ),
)
args.add_argument(
    "--mean-output-tokens",
    type=int,
    default=150,
    help=(
        "The mean number of tokens to generate from each llm request. This is the max_tokens param "
        "for the completions API. Note that this is not always the number of tokens returned. "
        "(default: %(default)s)"
    ),
)
args.add_argument(
    "--stddev-output-tokens",
    type=int,
    default=80,
    help=(
        "The stdandard deviation on the number of tokens to generate per llm request. "
        "(default: %(default)s)"
    ),
)
args.add_argument(
    "--num-concurrent-requests",
    type=int,
    default=10,
    help=("The number of concurrent requests to send (default: %(default)s)"),
)
args.add_argument(
    "--timeout",
    type=int,
    default=90,
    help="The amount of time to run the load test for. (default: %(default)s)",
)
args.add_argument(
    "--max-num-completed-requests",
    type=int,
    default=10,
    help=(
        "The number of requests to complete before finishing the test. Note "
        "that its possible for the test to timeout first. (default: %(default)s)"
    ),
)
args.add_argument(
    "--additional-sampling-params",
    type=str,
    default="{}",
    help=(
        "Additional sampling params to send with the each request to the LLM API. "
        "(default: %(default)s) No additional sampling params are sent."
    ),
)
args.add_argument(
    "--results-dir",
    type=str,
    default="",
    help=(
        "The directory to save the results to. "
        "(`default: %(default)s`) No results are saved)"
    ),
)
args.add_argument(
    "--llm-api",
    type=str,
    default="openai",
    help=(
        f"The name of the llm api to use. Can select from {SUPPORTED_APIS}"
        " (default: %(default)s)"
    ),
)
args.add_argument(
    "--metadata",
    type=str,
    default="",
    help=(
        "A comma separated list of metadata to include in the results, e.g. "
        "name=foo,bar=1. These will be added to the metadata field of the results. "
    ),
)

if __name__ == "__main__":
    env_vars = dict(os.environ)
    ray.init(runtime_env={"env_vars": env_vars})
    args = args.parse_args()

    # Parse user metadata.
    user_metadata = {}
    if args.metadata:
        for item in args.metadata.split(","):
            key, value = item.split("=")
            user_metadata[key] = value

    run_token_benchmark(
        llm_api=args.llm_api,
        model=args.model,
        test_timeout_s=args.timeout,
        max_num_completed_requests=args.max_num_completed_requests,
        mean_input_tokens=args.mean_input_tokens,
        stddev_input_tokens=args.stddev_input_tokens,
        mean_output_tokens=args.mean_output_tokens,
        stddev_output_tokens=args.stddev_output_tokens,
        num_concurrent_requests=args.num_concurrent_requests,
        additional_sampling_params=args.additional_sampling_params,
        results_dir=args.results_dir,
        user_metadata=user_metadata,
    )
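The same entry point remains available from the upstream checkout installed by on_startup.sh. A sketch, not part of this commit, of calling it programmatically instead of through the CLI; it must be run from inside the llmperf checkout so that token_benchmark_ray is importable, and the endpoint, key, and model name are placeholders.

import os

import ray

from token_benchmark_ray import run_token_benchmark

os.environ.setdefault("OPENAI_API_BASE", "http://localhost:8080/v1")  # placeholder endpoint
os.environ.setdefault("OPENAI_API_KEY", "dummy-key")                  # placeholder key

ray.init(runtime_env={"env_vars": dict(os.environ)}, ignore_reinit_error=True)

# Small request counts keep the dry run short; results land in ./results as JSON.
run_token_benchmark(
    llm_api="openai",
    model="my-model",  # placeholder model name
    test_timeout_s=120,
    max_num_completed_requests=4,
    num_concurrent_requests=2,
    mean_input_tokens=150,
    stddev_input_tokens=10,
    mean_output_tokens=50,
    stddev_output_tokens=10,
    additional_sampling_params="{}",  # JSON string, parsed inside run_token_benchmark
    results_dir="results",
    user_metadata={"run": "smoke-test"},
)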
on_startup.sh
CHANGED
@@ -14,6 +14,12 @@ git config --global credential.helper store
 ## Remove the temporary clone directory
 #rm -rf /tmp/tgi-benchmark-notebooks
 
+# Install llmperf
+cd ~/app
+git clone https://github.com/ray-project/llmperf.git
+cd llmperf
+git checkout afd137a
+
 # Add dark theme
 mkdir -p ~/.jupyter/lab/user-settings/@jupyterlab/apputils-extension/ && \
 echo '{ "theme":"JupyterLab Dark" }' > ~/.jupyter/lab/user-settings/@jupyterlab/apputils-extension/themes.jupyterlab-settings
requirements.txt
CHANGED
@@ -3,9 +3,10 @@ jupyterlab-vim==0.15.1
 jupyterlab-vimrc==0.5.2
 jupyter-server==2.3.0
 tornado==6.2
-ipywidgets
-
-
-
-
-
+ipywidgets==8.1.3
+huggingface-hub==0.23.2
+transformers==4.41.2
+pandas==2.2.2
+datasets==2.19.1
+plotly==5.22.0
+ray[default]==2.23.0