执行器

disaggServerUtil.h

namespace tensorrt_llm

namespace executor

namespace disagg_executor

class DisaggExecutorOrchestrator

公共函数

DisaggExecutorOrchestrator(std::vector<std::filesystem::path> const &ctxEnginePaths, std::vector<std::filesystem::path> const &genEnginePaths, std::vector<executor::ExecutorConfig> const &ctxExecutorConfigs, std::vector<executor::ExecutorConfig> const &genExecutorConfigs, bool hasContextAwaitThreads, bool hasGenAwaitThreads)

构建一个DisaggExecutorOrchestrator对象。

Parameters:

ctxEnginePaths – 上下文引擎文件的文件路径向量。
genEnginePaths – 生成引擎文件的文件路径向量。
ctxExecutorConfigs – 上下文执行器的ExecutorConfig向量。
genExecutorConfigs – 用于生成执行器的ExecutorConfig向量。
hasContextAwaitThreads – 是否存在为每个上下文执行器接收响应的线程。
hasGenAwaitThreads – 是否存在为每个生成执行器接收响应的线程。

std::vector<IdType> enqueueContext(std::vector<texec::Request> const &requests, std::optional<int> selectContextId = std::nullopt, bool batch = false)

将仅上下文的请求加入上下文执行器的队列。

Parameters:

requests – 仅包含上下文的请求向量。
selectContextId – 要使用的上下文执行器的索引。如果 std::nullopt，将使用具有最少未完成请求的执行器。
batch – 如果为true，则在相同的上下文执行器中排队请求。如果为false，将尝试为每个请求使用不同的执行器。

Returns:

一个全局请求ID的向量，对应于requests中请求的顺序，返回的ID可能与每个执行器中的请求ID不同。

void enqueueGeneration(std::vector<texec::Request> const &requests, std::vector<IdType> const &globalRequestIds, std::optional<int> selectGenIdx = std::nullopt, bool batch = false)

将仅生成请求加入生成执行器的队列。

Parameters:

requests – 一个仅用于生成的请求向量。
globalRequestIds – 一个全局请求ID的向量，对应于请求的顺序，并且必须是enqueueContext函数返回的ID。
selectGenIdx – 要使用的生成执行器的索引。如果 std::nullopt，将使用具有最少未完成请求的执行器。
batch – 如果为true，则在同一个生成执行器中排队请求。如果为false，将尝试为每个请求使用不同的执行器。

std::vector<ResponseWithId> awaitContextResponses(std::optional<std::chrono::milliseconds> const &timeout, std::optional<int> contextIdx = std::nullopt)

等待上下文响应。

Parameters:

timeout – 等待新响应的最长时间
contextIdx – 要使用的上下文执行器的索引。如果 std::nullopt，则在所有上下文执行器中返回准备好的响应，如果 hasContextAwaitThreads 为 true，则此参数必须为 std::nullopt。

Returns:

一个包含相应全局请求ID的响应向量

std::vector<ResponseWithId> awaitGenerationResponses(std::optional<std::chrono::milliseconds> const &timeout, std::optional<int> genIdx = std::nullopt)

等待生成响应。

Parameters:

timeout – 等待新响应的最长时间。
genIdx – 要使用的生成执行器的索引。如果 std::nullopt，则返回所有生成执行器中的就绪响应，如果 hasGenAwaitThreads 为 true，则此参数必须为 std::nullopt。

Returns:

一个包含相应全局请求ID的响应向量。

bool canEnqueue() const: 指示当前进程是否允许入队请求。

std::vector<std::unique_ptr<texec::Executor>> const &getContextExecutors() const: 获取上下文执行器。

std::vector<std::unique_ptr<texec::Executor>> const &getGenExecutors() const: 获取生成执行器。

~DisaggExecutorOrchestrator()

私有成员

std::unique_ptr<Impl> mImpl

struct ResponseWithId

公共函数

inline ResponseWithId(tensorrt_llm::executor::Response &&response, IdType gid)

inline ResponseWithId(tensorrt_llm::executor::Response const &response, IdType gid)

inline ResponseWithId(ResponseWithId &&other) noexcept

ResponseWithId(ResponseWithId const &other) = default

inline ResponseWithId &operator=(ResponseWithId &&other) noexcept

inline ResponseWithId &operator=(ResponseWithId const &other)

~ResponseWithId() = default

公共成员

tensorrt_llm::executor::Response response

IdType gid

executor.h

namespace tensorrt_llm

namespace batch_manager

namespace kv_cache_manager

namespace executor

类型定义

using RetentionPriority = SizeType32 

using KVCacheEventData = std::variant<KVCacheCreatedData, KVCacheStoredData, KVCacheRemovedData, KVCacheUpdatedData>

函数

char const *version() noexcept: TRT-LLM的版本。

class ContextPhaseParams

公共类型

using RequestIdType = std::uint64_t

公共函数

explicit ContextPhaseParams(VecTokens firstGenTokens, RequestIdType reqId)

ContextPhaseParams(VecTokens firstGenTokens, RequestIdType reqId, void *state)

ContextPhaseParams(ContextPhaseParams const&)

ContextPhaseParams(ContextPhaseParams&&) noexcept

ContextPhaseParams &operator=(ContextPhaseParams const&)

ContextPhaseParams &operator=(ContextPhaseParams&&) noexcept

~ContextPhaseParams()

bool operator==(ContextPhaseParams const&) const noexcept

VecTokens const &getFirstGenTokens() const & noexcept

VecTokens popFirstGenTokens() && noexcept

RequestIdType getReqId() const noexcept

void const *getState() const noexcept

void *getState() noexcept

void *releaseState() noexcept

私有类型

using StatePtr = std::unique_ptr<void, decltype(&deleter)>

私有成员

RequestIdType mReqId = {0}: 此请求对应于上下文阶段中的请求ID。

VecTokens mFirstGenTokens: 由上下文执行器生成的第一个令牌。

StatePtr mState = {nullptr, deleter}: 此请求的上下文阶段状态。

私有静态函数

static void deleter(void const *data)

友元

friend class Serialization

class DebugConfig

#include <executor.h>

用于调试输出的配置类。

公共函数

explicit DebugConfig(bool debugInputTensors = false, bool debugOutputTensors = false, StringVec debugTensorNames = {}, SizeType32 debugTensorsMaxIterations = 0)

bool operator==(DebugConfig const &other) const

bool getDebugInputTensors() const

bool getDebugOutputTensors() const

StringVec const &getDebugTensorNames() const

SizeType32 getDebugTensorsMaxIterations() const

void setDebugInputTensors(bool debugInputTensors)

void setDebugOutputTensors(bool debugOutputTensors)

void setDebugTensorNames(StringVec const &debugTensorNames)

void setDebugTensorsMaxIterations(SizeType32 debugTensorsMaxIterations)

私有类型

using StringVec = std::vector<std::string>

私有成员

bool mDebugInputTensors: 如果为真，调试所有输入张量。

bool mDebugOutputTensors: 如果为真，调试所有输出张量。

StringVec mDebugTensorNames: 如果不为空，仅调试此列表中的张量。

SizeType32 mDebugTensorsMaxIterations: 如果大于0，则最多为过去的debugTensorsMaxIterations次迭代提供调试张量，否则将它们转储到文件中。

友元

friend class Serialization

class DecodingConfig

#include <executor.h>

解码的配置类。

公共函数

explicit DecodingConfig(std::optional<DecodingMode> decodingMode = std::nullopt, std::optional<LookaheadDecodingConfig> lookaheadDecodingConfig = std::nullopt, std::optional<MedusaChoices> medusaChoices = std::nullopt, std::optional<EagleConfig> eagleConfig = std::nullopt)

bool operator==(DecodingConfig const &other) const

void setDecodingMode(DecodingMode const&): 设置解码模式。某些模式需要使用它们自己的设置器。

std::optional<DecodingMode> getDecodingMode() const

void setLookaheadDecoding(LookaheadDecodingConfig const &lookaheadDecodingConfig): 设置前瞻解码模式和配置。

std::optional<LookaheadDecodingConfig> getLookaheadDecodingConfig() const

void setMedusaChoices(MedusaChoices const&): 设置medusa模式和配置。

std::optional<MedusaChoices> getMedusaChoices() const

void setEagleConfig(EagleConfig const&): 设置Eagle模式和配置。

std::optional<EagleConfig> getEagleConfig() const

私有成员

std::optional<DecodingMode> mDecodingMode

std::optional<LookaheadDecodingConfig> mLookaheadDecodingConfig

std::optional<MedusaChoices> mMedusaChoices

std::optional<EagleConfig> mEagleConfig

友元

friend class Serialization

class DynamicBatchConfig

#include <executor.h>

用于动态调整批量大小和最大令牌数的配置类。在运行时，输入和输出长度的统计数据会被记录。基于这些统计数据，批量大小和最大令牌数会被动态调整，以更好地服务请求。

公共函数

explicit DynamicBatchConfig(bool enableBatchSizeTuning = false, bool enableMaxNumTokensTuning = false, SizeType32 dynamicBatchMovingAverageWindow = kDefaultDynamicBatchMovingAverageWindow, std::vector<std::pair<SizeType32, SizeType32>> batchSizeTable = kDefaultBatchSizeTable)

SizeType32 getDynamicBatchMovingAverageWindow() const

bool getEnableBatchSizeTuning() const

bool getEnableMaxNumTokensTuning() const

std::vector<std::pair<SizeType32, SizeType32>> getBatchSizeTable() const

公共静态属性

static SizeType32 const kDefaultDynamicBatchMovingAverageWindow = 128: 用于计算动态批量大小和最大令牌数的输入和输出长度的移动平均的默认窗口大小。

static std::vector<std::pair<SizeType32, SizeType32>> const kDefaultBatchSizeTable: 批量大小表的默认值。

私有成员

bool mEnableBatchSizeTuning: 控制是否应动态调整批量大小。

bool mEnableMaxNumTokensTuning: 控制是否应动态调整最大令牌数。

SizeType32 mDynamicBatchMovingAverageWindow: 用于计算动态批量大小和最大令牌数的输入和输出长度的移动平均窗口大小。

std::vector<std::pair<SizeType32, SizeType32>> mBatchSizeTable: 一个由 (batchSizeLimit, batchSize) 对组成的向量。当最大容量批处理大小小于 batchSizeLimit_{i} 但大于或等于 batchSizeLimit_{i-1} 时，批量大小将为 batchSize_{i}。

友元

friend class Serialization

struct EagleConfig

公共函数

explicit EagleConfig(std::optional<EagleChoices> eagleChoices = std::nullopt, bool greedySampling = true, std::optional<float> posteriorThreshold = std::nullopt)

bool operator==(EagleConfig const &other) const

std::optional<EagleChoices> getEagleChoices() const

std::optional<float> getPosteriorThreshold() const

bool isGreedySampling() const

私有函数

std::optional<float> const &checkPosteriorValue(std::optional<float> const &value)

私有成员

std::optional<EagleChoices> mEagleChoices: 为EAGLE-1形成选择树。

bool mGreedySampling: 标志，用于选择使用贪婪接受还是典型接受。

std::optional<float> mPosteriorThreshold: 典型接受的最小令牌概率。对应于https://arxiv.org/pdf/2401.10774中的epsilon。默认值为0.09f。

友元

friend class Serialization

class Executor

#include <executor.h>

执行器负责接收新请求和发送响应，并运行推理。

公共函数

Executor(std::filesystem::path const &modelPath, ModelType modelType, ExecutorConfig const &executorConfig)

Parameters:

modelPath – 定义要运行的模型的文件夹路径
modelType – 模型的类型
executorConfig – 执行器的配置
comm – 一个可选的进程间通信器配置

Executor(std::filesystem::path const &encoderModelPath, std::filesystem::path const &decoderModelPath, ModelType modelType, ExecutorConfig const &executorConfig)

Executor(BufferView const &engineBuffer, std::string const &jsonConfigStr, ModelType modelType, ExecutorConfig const &executorConfig, std::optional<std::map<std::string, Tensor>> const &managedWeights = std::nullopt)

Executor(BufferView const &encoderEngineBuffer, std::string const &encoderJsonConfigStr, BufferView const &decoderEngineBuffer, std::string const &decoderJsonConfigStr, ModelType modelType, ExecutorConfig const &executorConfig)

Executor(std::shared_ptr<Model> model, ExecutorConfig const &executorConfig)

Executor(std::shared_ptr<Model> encoderModel, std::shared_ptr<Model> decoderModel, ExecutorConfig const &executorConfig)

~Executor()

Executor(Executor const &executor) = delete

Executor &operator=(Executor const &executor) = delete

Executor(Executor&&) = default

Executor &operator=(Executor&&) = default

IdType enqueueRequest(Request const &request)

将新请求加入队列。

Parameters:: request – 包含输入标记和请求参数的LLM请求
Returns:: 标识请求的唯一ID

std::vector<IdType> enqueueRequests(std::vector<Request> const &requests): 将一批请求加入队列。

std::vector<Response> awaitResponses(std::optional<std::chrono::milliseconds> const &timeout = std::nullopt)

等待准备就绪的响应。

   此重载等待任何准备好的响应。特别是，如果有多个请求
   已经入队，此方法将提供任何准备好的响应，而不保证顺序。

Parameters:: timeout – 等待新响应的最长时间
Returns:: 响应向量

std::vector<Response> awaitResponses(IdType const &requestId, std::optional<std::chrono::milliseconds> const &timeout = std::nullopt)

等待准备就绪的响应。

Parameters:

requestId – 请求的ID
timeout – 等待新响应的最长时间

Returns:

响应向量

std::vector<std::vector<Response>> awaitResponses(std::vector<IdType> const &requestIds, std::optional<std::chrono::milliseconds> const &timeout = std::nullopt)

等待多个准备好的响应。

   对多个ID的请求，其行为等同于对所有ID分别调用 awaitResponses(IdType, timeout)。
   返回的向量按 requestIds 指定的相同顺序，为每个ID包含一个响应向量。
   与 awaitResponses(IdType, timeout) 相同的行为同样适用：
   * 响应可能为空。
   * 如果某个 requestId 的所有响应都已返回，
     则除非指定了超时，否则此方法将一直挂起。

Parameters:

requestIds – 请求的Ids
timeout – 等待新响应的最长时间

Returns:

响应的向量向量

SizeType32 getNumResponsesReady(std::optional<IdType> const &requestId = std::nullopt) const

获取准备好的响应数量。

Parameters:: requestId – 一个可选的请求ID
Returns:: 准备好的响应数量

void cancelRequest(IdType requestId)

取消具有提供的请求ID的请求。

Parameters:: requestId – 要取消响应的请求ID

void shutdown()

向服务器发出关闭信号。

此调用是阻塞的。只有在所有请求都已终止或达到超时时才会返回

std::deque<IterationStats> getLatestIterationStats()

返回自上次调用getLatestIterationStats以来计算的每次迭代统计信息。最多包含iterStatsMaxIterations次迭代。

Returns:: 迭代统计

std::deque<RequestStatsPerIteration> getLatestRequestStats()

返回自上次调用getLatestRequestStats以来计算的每次迭代的请求统计信息。最多包含requestStatsMaxIterations次迭代。

Returns:: 请求按迭代分组的统计信息

std::deque<DebugTensorsPerIteration> getLatestDebugTensors()

返回自上次调用getLatestDebugTensors以来计算的每次迭代的调试张量。最多包含debugTensorsMaxIterations次迭代。

Returns:: 请求按迭代分组的调试张量

bool canEnqueueRequests() const: 指示当前进程是否允许入队请求。

bool isParticipant() const: 指示当前进程是否参与此执行器实例。

std::optional<std::shared_ptr<KVCacheEventManager>> getKVCacheEventManager() const

私有成员

std::unique_ptr<Impl> mImpl

class ExecutorConfig

#include <executor.h>

模型执行器的配置类。

公共函数

explicit ExecutorConfig(SizeType32 maxBeamWidth = 1, SchedulerConfig schedulerConfig = SchedulerConfig(), KvCacheConfig kvCacheConfig = KvCacheConfig(), bool enableChunkedContext = true, bool normalizeLogProbs = true, SizeType32 iterStatsMaxIterations = kDefaultIterStatsMaxIterations, SizeType32 requestStatsMaxIterations = kDefaultRequestStatsMaxIterations, BatchingType batchingType = BatchingType::kINFLIGHT, std::optional<SizeType32> maxBatchSize = std::nullopt, std::optional<SizeType32> maxNumTokens = std::nullopt, std::optional<ParallelConfig> parallelConfig = std::nullopt, std::optional<PeftCacheConfig> const &peftCacheConfig = std::nullopt, std::optional<LogitsPostProcessorConfig> logitsPostProcessorConfig = std::nullopt, std::optional<DecodingConfig> decodingConfig = std::nullopt, float gpuWeightsPercent = 1, std::optional<SizeType32> maxQueueSize = std::nullopt, ExtendedRuntimePerfKnobConfig const &extendedRuntimePerfKnobConfig = ExtendedRuntimePerfKnobConfig(), std::optional<DebugConfig> debugConfig = std::nullopt, SizeType32 recvPollPeriodMs = 0, uint64_t maxSeqIdleMicroseconds = kDefaultMaxSeqIdleMicroseconds, std::optional<SpeculativeDecodingConfig> specDecConfig = std::nullopt, std::optional<GuidedDecodingConfig> guidedDecodingConfig = std::nullopt)

SizeType32 getMaxBeamWidth() const

SchedulerConfig getSchedulerConfig() const

KvCacheConfig getKvCacheConfig() const

SchedulerConfig &getSchedulerConfigRef()

KvCacheConfig &getKvCacheConfigRef()

bool getEnableChunkedContext() const

bool getNormalizeLogProbs() const

SizeType32 getIterStatsMaxIterations() const

SizeType32 getRequestStatsMaxIterations() const

BatchingType getBatchingType() const

std::optional<SizeType32> getMaxBatchSize() const

std::optional<SizeType32> getMaxNumTokens() const

std::optional<ParallelConfig> getParallelConfig() const

std::optional<PeftCacheConfig> getPeftCacheConfig() const

std::optional<LogitsPostProcessorConfig> getLogitsPostProcessorConfig() const

std::optional<DecodingConfig> getDecodingConfig() const

float getGpuWeightsPercent() const

std::optional<SizeType32> getMaxQueueSize() const

ExtendedRuntimePerfKnobConfig getExtendedRuntimePerfKnobConfig() const

std::optional<DebugConfig> getDebugConfig() const

SizeType32 getRecvPollPeriodMs() const

uint64_t getMaxSeqIdleMicroseconds() const

std::optional<SpeculativeDecodingConfig> getSpecDecConfig() const

std::optional<GuidedDecodingConfig> getGuidedDecodingConfig() const

void setMaxBeamWidth(SizeType32 maxBeamWidth)

void setMaxBatchSize(SizeType32 maxBatchSize)

void setMaxNumTokens(SizeType32 maxNumTokens)

void setSchedulerConfig(SchedulerConfig const &schedulerConfig)

void setKvCacheConfig(KvCacheConfig const &kvCacheConfig)

void setEnableChunkedContext(bool enableChunkedContext)

void setNormalizeLogProbs(bool normalizeLogProbs)

void setIterStatsMaxIterations(SizeType32 iterStatsMaxIterations)

void setRequestStatsMaxIterations(SizeType32 requestStatsMaxIterations)

void setBatchingType(BatchingType batchingType)

void setParallelConfig(ParallelConfig const &parallelConfig)

void setPeftCacheConfig(PeftCacheConfig const &peftCacheConfig)

void setLogitsPostProcessorConfig(LogitsPostProcessorConfig const &logitsPostProcessorConfig)

void setDecodingConfig(DecodingConfig const &decodingConfig)

void setGpuWeightsPercent(float const &gpuWeightsPercent)

void setMaxQueueSize(std::optional<SizeType32> const &maxQueueSize)

void setExtendedRuntimePerfKnobConfig(ExtendedRuntimePerfKnobConfig const &extendedRuntimePerfKnobConfig)

void setDebugConfig(DebugConfig const &debugConfig)

void setRecvPollPeriodMs(SizeType32 const &recvPollPeriodMs)

void setMaxSeqIdleMicroseconds(uint64_t maxNumTokens)

void setSpecDecConfig(SpeculativeDecodingConfig const &specDecConfig)

void setGuidedDecodingConfig(GuidedDecodingConfig const &guidedDecodingConfig)

公共静态属性

static constexpr uint64_t kDefaultMaxSeqIdleMicroseconds = 180000000

static constexpr SizeType32 kDefaultIterStatsMaxIterations = 1000

static constexpr SizeType32 kDefaultRequestStatsMaxIterations = 0

私有成员

SizeType32 mMaxBeamWidth: 将发送给执行器的请求的束宽（beam width）值。

SchedulerConfig mSchedulerConfig: 调度器配置。

KvCacheConfig mKvCacheConfig: KV缓存配置。

bool mEnableChunkedContext: 控制是否启用分块上下文（chunked context）。

bool mNormalizeLogProbs: 控制是否应对对数概率进行归一化。

SizeType32 mIterStatsMaxIterations: 控制保留统计数据的最大迭代次数。

SizeType32 mRequestStatsMaxIterations: 控制保留每个请求统计信息的最大迭代次数。

BatchingType mBatchingType: 使用的批处理策略类型。请参阅 BatchingType。

std::optional<SizeType32> mMaxBatchSize: 请求的最大批量大小。

std::optional<SizeType32> mMaxNumTokens: 每批的最大令牌数。

std::optional<ParallelConfig> mParallelConfig: 并行执行配置。

std::optional<PeftCacheConfig> mPeftCacheConfig

std::optional<LogitsPostProcessorConfig> mLogitsPostProcessorConfig: Logits 后处理器配置。

std::optional<DecodingConfig> mDecodingConfig: 解码配置。

float mGpuWeightsPercent: 用于权重流的GPU权重百分比。

std::optional<SizeType32> mMaxQueueSize: 在拒绝新请求之前，队列中允许的最大请求数。

ExtendedRuntimePerfKnobConfig mExtendedRuntimePerfKnobConfig: 可以在运行时设置的性能旋钮的配置。

std::optional<DebugConfig> mDebugConfig: 调试配置。

SizeType32 mRecvPollPeriodMs: 在orchestrator模式下轮询新通信的时间（以毫秒为单位）。使用0表示忙循环。

uint64_t mMaxSeqIdleMicroseconds: 计划请求在被终止前可以保持空闲的最大时间，以微秒为单位。默认值为3分钟。

std::optional<SpeculativeDecodingConfig> mSpeculativeDecodingConfig: 推测解码配置。

std::optional<GuidedDecodingConfig> mGuidedDecodingConfig: 引导解码配置。

友元

friend class Serialization

class ExtendedRuntimePerfKnobConfig

#include <executor.h>

运行时性能调节的配置类。

公共函数

explicit ExtendedRuntimePerfKnobConfig(bool multiBlockMode = true, bool enableContextFMHAFP32Acc = false, bool cudaGraphMode = false, SizeType32 cudaGraphCacheSize = 0)

inline bool operator==(ExtendedRuntimePerfKnobConfig const &other) const

bool getMultiBlockMode() const

bool getEnableContextFMHAFP32Acc() const

bool getCudaGraphMode() const

SizeType32 getCudaGraphCacheSize() const

void setMultiBlockMode(bool multiBlockMode)

void setEnableContextFMHAFP32Acc(bool enableContextFMHAFP32Acc)

void setCudaGraphMode(bool cudaGraphMode)

void setCudaGraphCacheSize(SizeType32 cacheSize)

私有成员

bool mMultiBlockMode: 控制是否应启用多块模式。

bool mEnableContextFMHAFP32Acc: 是否启用FMHA运行器的FP32累加。

bool mCudaGraphMode: 控制是否启用CUDA图。

SizeType32 mCudaGraphCacheSize: 运行时缓存的cuda图数量。缓存越大，性能越好，但会消耗更多的GPU内存。

友元

friend class Serialization

class ExternalDraftTokensConfig

#include <executor.h>

使用外部草稿令牌进行推测解码的配置。允许包含草稿令牌、草稿逻辑并指定接受阈值。

公共函数

explicit ExternalDraftTokensConfig(VecTokens tokens, std::optional<Tensor> logits = std::nullopt, std::optional<FloatType> const &acceptanceThreshold = std::nullopt, std::optional<bool> const &fastLogits = std::nullopt)

VecTokens getTokens() const

std::optional<Tensor> getLogits() const

std::optional<FloatType> getAcceptanceThreshold() const

std::optional<bool> getFastLogits() const

私有成员

VecTokens mTokens: 草稿令牌。

std::optional<Tensor> mLogits: 草稿logits。预期形状：[num_draft_tokens, vocab_size]。

std::optional<FloatType> mAcceptanceThreshold: 接受阈值。必须大于 0.f 且小于等于 1.f。

std::optional<bool> mFastLogits: 使用直接传输来处理草稿logits。

友元

friend class Serialization

class GuidedDecodingConfig

#include <executor.h>

执行器的引导解码配置。

公共类型

enum class GuidedDecodingBackend

值：

enumerator kXGRAMMAR: 启用带有XGrammar后端的引导解码。

公共函数

explicit GuidedDecodingConfig(GuidedDecodingBackend backend, std::optional<std::vector<std::string>> encodedVocab = std::nullopt, std::optional<std::string> tokenizerStr = std::nullopt, std::optional<std::vector<TokenIdType>> stopTokenIds = std::nullopt)

bool operator==(GuidedDecodingConfig const &other) const

void setBackend(GuidedDecodingBackend const &backend)

GuidedDecodingBackend getBackend() const

void setEncodedVocab(std::vector<std::string> const &encodedVocab)

std::optional<std::vector<std::string>> getEncodedVocab() const

void setTokenizerStr(std::string const &tokenizerStr)

std::optional<std::string> getTokenizerStr() const

void setStopTokenIds(std::vector<TokenIdType> const &stopTokenIds)

std::optional<std::vector<TokenIdType>> getStopTokenIds() const

void validate() const

私有成员

GuidedDecodingBackend mBackend: 引导解码后端。目前支持XGrammar。

std::optional<std::vector<std::string>> mEncodedVocab

编码的词汇表。对于huggingface的分词器，可以通过以下方式提取：

encoded_vocab = tokenizer.get_vocab()
encoded_vocab = [token for token, _ in sorted(encoded_vocab.items(), key=lambda x: x[1])]

std::optional<std::string> mTokenizerStr

分词器字符串。对于huggingface快速分词器，可以通过以下方式提取：

tokenizer_str = tokenizer.backend_tokenizer.to_str()

std::optional<std::vector<TokenIdType>> mStopTokenIds: 停止标记ID。如果未提供，可以自动检测。

友元

friend class Serialization

class GuidedDecodingParams

#include <executor.h>

请求的引导解码参数。

公共类型

enum class GuideType

值：

enumerator kJSON: 生成的文本符合json格式。

enumerator kJSON_SCHEMA: 生成的文本符合json格式，并满足用户指定的额外限制，即schema。

enumerator kREGEX: 生成的文本符合用户指定的正则表达式。

enumerator kEBNF_GRAMMAR: 生成的文本符合用户指定的扩展巴科斯-瑙尔形式（EBNF）语法。EBNF语法广泛用于表达上下文无关语法。

公共函数

explicit GuidedDecodingParams(GuideType guideType, std::optional<std::string> guide = std::nullopt)

bool operator==(GuidedDecodingParams const &other) const

GuideType getGuideType() const

std::optional<std::string> getGuide() const

私有成员

GuideType mGuideType: 引导类型。参见GuideType。

std::optional<std::string> mGuide: 详细的引导字符串。它可以是json模式、正则表达式或EBNF语法，具体取决于mGuideType。

友元

friend class Serialization

class JsonSerialization

#include <executor.h>

包含实用函数的类，用于将统计信息序列化为json字符串。

公共静态函数

static std::string toJsonStr(IterationStats const &iterationStats): 实用函数，用于将iterationStats结构转换为json序列化字符串。

static std::string toJsonStr(RequestStatsPerIteration const &requestStatsPerIter): 将requestStatsPerIteration结构体转换为json序列化字符串的实用函数。

static std::string toJsonStr(RequestStats const &requestStats): 实用函数，用于将requestStats结构体转换为json序列化字符串。

class KvCacheConfig

#include <executor.h>

KV缓存的配置类。

公共函数

explicit KvCacheConfig(bool enableBlockReuse = false, std::optional<SizeType32> const &maxTokens = std::nullopt, std::optional<std::vector<SizeType32>> const &maxAttentionWindowVec = std::nullopt, std::optional<SizeType32> const &sinkTokenLength = std::nullopt, std::optional<FloatType> const &freeGpuMemoryFraction = std::nullopt, std::optional<size_t> const &hostCacheSize = std::nullopt, bool onboardBlocks = true, std::optional<FloatType> const &crossKvCacheFraction = std::nullopt, std::optional<RetentionPriority> secondaryOffloadMinPriority = std::nullopt, size_t eventBufferMaxSize = 0, std::optional<tensorrt_llm::runtime::RuntimeDefaults> const &runtimeDefaults = std::nullopt)

bool getEnableBlockReuse() const

std::optional<SizeType32> getMaxTokens() const

std::optional<std::vector<SizeType32>> getMaxAttentionWindowVec() const

std::optional<SizeType32> getSinkTokenLength() const

std::optional<FloatType> getFreeGpuMemoryFraction() const

std::optional<FloatType> getCrossKvCacheFraction() const

std::optional<size_t> getHostCacheSize() const

bool getOnboardBlocks() const

std::optional<RetentionPriority> getSecondaryOffloadMinPriority() const

size_t getEventBufferMaxSize() const

void setEnableBlockReuse(bool enableBlockReuse)

void setMaxTokens(SizeType32 maxTokens)

void setMaxAttentionWindowVec(std::vector<SizeType32> maxAttentionWindowVec)

void setSinkTokenLength(SizeType32 sinkTokenLength)

void setFreeGpuMemoryFraction(FloatType freeGpuMemoryFraction)

void setCrossKvCacheFraction(FloatType crossKvCacheFraction)

void setHostCacheSize(size_t hostCacheSize)

void setOnboardBlocks(bool onboardBlocks)

void setSecondaryOffloadMinPriority(std::optional<RetentionPriority> secondaryOffloadMinPriority)

void setEventBufferMaxSize(size_t eventBufferMaxSize)

void fillEmptyFieldsFromRuntimeDefaults(tensorrt_llm::runtime::RuntimeDefaults runtimeDefaults)

私有成员

bool mEnableBlockReuse: 控制KV缓存块是否可以用于不同的请求。

std::optional<SizeType32> mMaxTokens: 应该存储在KV缓存中的最大令牌数。如果同时指定了mMaxTokens和mFreeGpuMemoryFraction，则将分配对应于最小值的存储器。

std::optional<std::vector<SizeType32>> mMaxAttentionWindowVec: 每个序列的注意力窗口大小。只有每个序列的最后mMaxAttentionWindow个标记会存储在KV缓存中。不同的层可能有不同的最大注意力窗口大小。如果mMaxAttentionWindowVec中的元素数量少于层数，mMaxAttentionWindowVec将被重复多次以达到层数。

std::optional<SizeType32> mSinkTokenLength: 下沉标记的数量（始终保持在注意力窗口中的标记）

std::optional<FloatType> mFreeGpuMemoryFraction: 应该为KV缓存分配的GPU内存比例。默认值为90%。如果同时指定了mMaxTokens和mFreeGpuMemoryFraction，则将分配对应于最小值的相应内存。

std::optional<FloatType> mCrossKvCacheFraction: KV缓存内存的一部分应保留用于交叉注意力。如果设置为p，自注意力将使用1-p的KV缓存内存，而交叉注意力将使用p的KV缓存内存。默认值为50%。仅在使用编码器-解码器模型时应设置此值。

std::optional<size_t> mHostCacheSize: 二级内存池的大小，以字节为单位。默认值为0。拥有二级内存池可以增加KV缓存块的复用潜力。

bool mOnboardBlocks: 控制卸载的块在重新使用之前是否应重新载入主内存。

std::optional<RetentionPriority> mSecondaryOffloadMinPriority: 只有优先级大于 mSecondaryOffloadMinPriority 的块才能卸载到辅助内存。

size_t mEventBufferMaxSize: KV缓存事件缓冲区的最大大小。

友元

friend class Serialization

struct KVCacheCreatedData

公共成员

std::vector<SizeType32> numBlocksPerCacheLevel: 每个缓存级别的块数量。

struct KVCacheEvent

公共函数

KVCacheEvent(IdType eventId, KVCacheEventData data)

公共成员

IdType eventId: 此事件的唯一标识符。

KVCacheEventData data: 此事件对应的数据。

template<typename T> struct KVCacheEventDiff

公共成员

T oldValue

T newValue

class KVCacheEventManager

#include <executor.h>

暴露了一组有限的KV缓存管理器功能。

公共函数

KVCacheEventManager(std::shared_ptr<tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager> kvCacheManager)

std::deque<KVCacheEvent> getLatestEvents(std::optional<std::chrono::milliseconds> timeout = std::nullopt)

获取最新的KV缓存事件。

Parameters:: timeout – 等待新事件的最长时间。如果为nullopt，则仅在有新事件可用或执行器实例关闭时返回。

私有成员

std::shared_ptr<tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager> kvCacheManager

struct KVCacheRemovedData

公共成员

std::vector<IdType> blockHashes: 正在移除的区块的哈希值。

class KvCacheRetentionConfig

#include <executor.h>

请求在KV缓存中的保留配置。

公共函数

inline explicit KvCacheRetentionConfig()

explicit KvCacheRetentionConfig(std::vector<TokenRangeRetentionConfig> const &tokenRangeRetentionPriorities, RetentionPriority decodeRetentionPriority = kDefaultRetentionPriority, std::optional<std::chrono::milliseconds> decodeDurationMs = std::nullopt)

std::vector<TokenRangeRetentionConfig> getTokenRangeRetentionConfigs() const

RetentionPriority getDecodeRetentionPriority() const

std::optional<std::chrono::milliseconds> getDecodeDurationMs() const

std::vector<RetentionPriorityAndDuration> getPerBlockRetentionPriorityDuration(SizeType32 blockSize, SizeType32 seqLen) const: 将令牌范围数据转换为每个kv块的条目。返回一个元组，其中包含与每个块的优先级和持续时间对应的向量。

公共静态属性

static constexpr RetentionPriority kMinRetentionPriority = 0

static constexpr RetentionPriority kMaxRetentionPriority = 100

static constexpr RetentionPriority kDefaultRetentionPriority = 35

私有成员

std::vector<TokenRangeRetentionConfig> mTokenRangeRetentionConfigs: 要更新的令牌范围和优先级级别。范围必须不重叠。例如 [(0, 64), (100, 128), (70, 80)] 是有效的，而 [(0, 64), (60, 128)] 则不是。

RetentionPriority mDecodeRetentionPriority: 分配给解码阶段分配的块的优先级。

std::optional<std::chrono::milliseconds> mDecodeDurationMs: 解码块应保持其指定优先级级别的持续时间（以毫秒为单位）。

struct TokenRangeRetentionConfig

#include <executor.h>

设置令牌范围内块优先级的单个条目。较早的范围总是优先于较晚的范围。例如，块大小为16时，范围[0, 17]将应用于前两个块。

公共函数

inline explicit TokenRangeRetentionConfig(SizeType32 tokenStart, std::optional<SizeType32> tokenEnd = std::nullopt, RetentionPriority priority = KvCacheRetentionConfig::kDefaultRetentionPriority, std::optional<std::chrono::milliseconds> durationMs = std::nullopt)

inline bool operator==(TokenRangeRetentionConfig const &other) const

公共成员

SizeType32 tokenStart: 此范围的第一个标记。

std::optional<SizeType32> tokenEnd: 此范围的最后一个标记。结束不包括在范围内。可以将其设置为std::nullopt以将范围扩展到序列的末尾。

RetentionPriority priority: 此令牌范围的优先级。优先级越高，越不容易被驱逐或卸载。

std::optional<std::chrono::milliseconds> durationMs: 块在给定优先级级别应保持的持续时间（以毫秒为单位）。设置为std::nullopt表示没有过期时间，并保持块在给定优先级级别，直到它被回收。持续时间过后，块将被移回kDefaultRetentionPriority级别。

struct KVCacheStoredBlockData

#include <executor.h>

存储在树中的单个块的条目。

公共函数

inline KVCacheStoredBlockData(IdType blockHash, tensorrt_llm::runtime::VecUniqueTokens tokens, tensorrt_llm::runtime::LoraTaskIdType loraId, SizeType32 cacheLevel, SizeType32 priority)

公共成员

IdType blockHash: 区块的哈希值。

tensorrt_llm::runtime::VecUniqueTokens tokens: 区块的唯一令牌。

tensorrt_llm::runtime::LoraTaskIdType loraId: 区块的Lora任务ID。

SizeType32 cacheLevel: 块的缓存级别。

SizeType32 priority: 块的优先级。

struct KVCacheStoredData

公共成员

std::optional<IdType> parentHash: 此存储块序列的父级。

std::vector<KVCacheStoredBlockData> blocks: 一系列区块。区块i的父区块是区块i-1

struct KVCacheUpdatedData

公共函数

inline explicit KVCacheUpdatedData(IdType blockHash)

inline KVCacheUpdatedData &cacheLevelUpdated(SizeType32 oldValue, SizeType32 newValue)

inline KVCacheUpdatedData &priorityUpdated(SizeType32 oldValue, SizeType32 newValue)

公共成员

IdType blockHash: 更新后的区块的哈希值。

std::optional<KVCacheEventDiff<SizeType32>> cacheLevel = std::nullopt: cacheLevel 字段的更新值。

std::optional<KVCacheEventDiff<SizeType32>> priority = std::nullopt: 优先级字段的更新值。

class LogitsPostProcessorConfig

公共函数

explicit LogitsPostProcessorConfig(std::optional<LogitsPostProcessorMap> processorMap = std::nullopt, std::optional<LogitsPostProcessorBatched> processorBatched = std::nullopt, bool replicate = true)

std::optional<LogitsPostProcessorMap> getProcessorMap() const

std::optional<LogitsPostProcessorBatched> getProcessorBatched() const

bool getReplicate() const

void setProcessorMap(LogitsPostProcessorMap const &processorMap)

void setProcessorBatched(LogitsPostProcessorBatched const &processorBatched)

void setReplicate(bool replicate)

私有成员

std::optional<LogitsPostProcessorMap> mProcessorMap: 从后处理器名称到非批处理后处理器的映射

std::optional<LogitsPostProcessorBatched> mProcessorBatched: 单个批处理后处理器

bool mReplicate: 如果设置为true，logits后处理器将在最后一个PP rank的所有TP rank上运行。

struct LookaheadDecodingConfig

公共函数

LookaheadDecodingConfig(SizeType32 windowSize, SizeType32 ngramSize, SizeType32 verificationSetSize)

inline explicit LookaheadDecodingConfig()

bool operator==(LookaheadDecodingConfig const &other) const

std::tuple<SizeType32 const, SizeType32 const, SizeType32 const> get() const

SizeType32 getWindowSize() const

SizeType32 getNgramSize() const

SizeType32 getVerificationSetSize() const

std::tuple<SizeType32, SizeType32, SizeType32, SizeType32> calculateSpeculativeResource() const: 返回 <maxDecodingTokens, maxPathLen, maxDraftTokens, maxDraftPathLen>。

bool isLE(LookaheadDecodingConfig const &that) const: 当this可以在that定义的资源上执行时返回true

公共静态函数

static bool isLegal(SizeType32 windowSize, SizeType32 ngramSize, SizeType32 verificationSetSize) noexcept: 当参数组合有效时返回true。

私有成员

SizeType32 mWindowSize

SizeType32 mNgramSize

SizeType32 mVerificationSetSize

友元

friend class Serialization

class LoraConfig

#include <executor.h>

LoRA的配置。

公共函数

explicit LoraConfig(IdType taskId, std::optional<Tensor> weights = std::nullopt, std::optional<Tensor> config = std::nullopt)

IdType getTaskId() const

std::optional<Tensor> getWeights() const

std::optional<Tensor> getConfig() const

私有成员

IdType mTaskId: Lora任务ID。

std::optional<Tensor> mWeights: Lora权重。有关预期形状和类型，请参阅TRT-LLM文档。

std::optional<Tensor> mConfig: Lora配置。有关配置张量的详细描述，请参阅TRT-LLM文档。

友元

friend class Serialization

class MropeConfig

#include <executor.h>

mrope的配置。

公共函数

explicit MropeConfig(Tensor mropeRoratySinCos, SizeType32 mropePositionDeltas)

Tensor getMRopeRotarySinCos() const

SizeType32 getMRopePositionDeltas() const

私有成员

Tensor mMRopeRotarySinCos: mrope旋转正弦和余弦缓存。预期形状：[maxPositionEmbeddings*rotaryEmbeddingDim]，数据类型必须为float32。

SizeType32 mMRopePositionDeltas: mrope 位置变化量。

友元

friend class Serialization

class OrchestratorConfig

公共函数

explicit OrchestratorConfig(bool isOrchestrator = true, std::string workerExecutablePath = "", std::shared_ptr<mpi::MpiComm> orchLeaderComm = nullptr, bool spawnProcesses = true)

bool getIsOrchestrator() const

std::string getWorkerExecutablePath() const

std::shared_ptr<mpi::MpiComm> getOrchLeaderComm() const

bool getSpawnProcesses() const

void setIsOrchestrator(bool isOrchestrator)

void setWorkerExecutablePath(std::string const &workerExecutablePath)

void setOrchLeaderComm(std::shared_ptr<mpi::MpiComm> const &orchLeaderComm)

void setSpawnProcesses(bool spawnProcesses)

私有成员

bool mIsOrchestrator

std::string mWorkerExecutablePath

std::shared_ptr<mpi::MpiComm> mOrchLeaderComm

bool mSpawnProcesses

class OutputConfig

#include <executor.h>

控制结果输出的配置。

公共函数

explicit OutputConfig(bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false, bool excludeInputFromOutput = false, bool returnEncoderOutput = false, bool returnPerfMetrics = false)

公共成员

bool returnLogProbs: 控制是否结果应包含对数概率。默认值为false。

bool returnContextLogits: 控制是否结果应包含上下文logits。默认值为false。

bool returnGenerationLogits: 控制是否结果应包含生成logits。默认值为false。

bool excludeInputFromOutput: 控制结果中的输出令牌是否应包含输入令牌。默认值为false。

bool returnEncoderOutput: 控制是否结果应包含编码器输出的隐藏状态（仅适用于编码器和编码器-解码器模型）。默认值为false。

bool returnPerfMetrics: 控制是否结果应包含性能指标。

class ParallelConfig

#include <executor.h>

用于并行执行参数的配置类。目前仅支持 commType = CommunicationType::kMPI。

公共函数

explicit ParallelConfig(CommunicationType commType = CommunicationType::kMPI, CommunicationMode commMode = CommunicationMode::kLEADER, std::optional<std::vector<SizeType32>> deviceIds = std::nullopt, std::optional<std::vector<SizeType32>> participantIds = std::nullopt, std::optional<OrchestratorConfig> const &orchestratorConfig = std::nullopt)

构造函数。

Parameters:

commType – 通信类型。参见 CommunicationType。
commMode – 通信模式。请参见CommunicationMode。
deviceIds – 参与模型执行的GPU的ID
participantIds – 参与模型执行的参与者ID（如果commType == kMPI，则为MPI rank）。第一个参与者被视为领导者（leader）。

CommunicationType getCommunicationType() const

CommunicationMode getCommunicationMode() const

std::optional<std::vector<SizeType32>> getDeviceIds() const

std::optional<std::vector<SizeType32>> getParticipantIds() const

std::optional<OrchestratorConfig> getOrchestratorConfig() const

void setCommunicationType(CommunicationType type)

void setCommunicationMode(CommunicationMode mode)

void setDeviceIds(std::vector<SizeType32> const &deviceIds)

void setParticipantIds(std::vector<SizeType32> const &participantIds)

void setOrchestratorConfig(OrchestratorConfig const &orchestratorConfig)

私有成员

CommunicationType mCommType: 使用的通信协议类型。默认是MPI。

CommunicationMode mCommMode: 通信模式。参见CommunicationMode。

std::optional<std::vector<SizeType32>> mDeviceIds: 用于执行此模型的GPU设备ID。

std::optional<std::vector<SizeType32>> mParticipantIds: 用于执行此模型的参与者ID（例如MPI rank）。

std::optional<OrchestratorConfig> mOrchestratorConfig: 可选的编排器配置。

友元

friend class Serialization

class PeftCacheConfig

#include <executor.h>

PeftCacheManager 的配置

公共函数

explicit PeftCacheConfig(SizeType32 numHostModuleLayer = 0, SizeType32 numDeviceModuleLayer = 0, SizeType32 optimalAdapterSize = kDefaultOptimalAdapterSize, SizeType32 maxAdapterSize = kDefaultMaxAdapterSize, SizeType32 numPutWorkers = 1, SizeType32 numEnsureWorkers = 1, SizeType32 numCopyStreams = 1, SizeType32 maxPagesPerBlockHost = kDefaultMaxPagesPerBlockHost, SizeType32 maxPagesPerBlockDevice = kDefaultMaxPagesPerBlockDevice, std::optional<float> const &deviceCachePercent = std::nullopt, std::optional<size_t> const &hostCacheSize = std::nullopt)

bool operator==(PeftCacheConfig const &other) const

SizeType32 getNumHostModuleLayer() const

SizeType32 getNumDeviceModuleLayer() const

SizeType32 getOptimalAdapterSize() const

SizeType32 getMaxAdapterSize() const

SizeType32 getNumPutWorkers() const

SizeType32 getNumEnsureWorkers() const

SizeType32 getNumCopyStreams() const

SizeType32 getMaxPagesPerBlockHost() const

SizeType32 getMaxPagesPerBlockDevice() const

std::optional<float> getDeviceCachePercent() const

std::optional<size_t> getHostCacheSize() const

公共静态属性

static constexpr SizeType32 kDefaultOptimalAdapterSize = 8

static constexpr SizeType32 kDefaultMaxAdapterSize = 64

static constexpr SizeType32 kDefaultMaxPagesPerBlockHost = 24

static constexpr SizeType32 kDefaultMaxPagesPerBlockDevice = 8

私有成员

SizeType32 mNumHostModuleLayer

SizeType32 mNumDeviceModuleLayer

SizeType32 mOptimalAdapterSize

SizeType32 mMaxAdapterSize

SizeType32 mNumPutWorkers

SizeType32 mNumEnsureWorkers

SizeType32 mNumCopyStreams

SizeType32 mMaxPagesPerBlockHost

SizeType32 mMaxPagesPerBlockDevice

std::optional<FloatType> mDeviceCachePercent

std::optional<size_t> mHostCacheSize

友元

friend class Serialization

class PromptTuningConfig

#include <executor.h>

提示调优的配置。

公共函数

explicit PromptTuningConfig(Tensor embeddingTable, std::optional<VecTokenExtraIds> inputTokenExtraIds = std::nullopt)

Tensor getEmbeddingTable() const

std::optional<VecTokenExtraIds> getInputTokenExtraIds() const

私有成员

Tensor mEmbeddingTable: 提示嵌入表。预期形状：[任务词汇量大小, 隐藏层大小]。数据类型必须与模型权重匹配。

std::optional<VecTokenExtraIds> mInputTokenExtraIds: 当启用p-tuning时，用于KV缓存重用的输入令牌额外ID。

友元

friend class Serialization

class Request

#include <executor.h>

一个包含请求信息的类。

公共函数

Request(VecTokens inputTokenIds, SizeType32 maxTokens, bool streaming = false, SamplingConfig const &samplingConfig = SamplingConfig(), OutputConfig const &outputConfig = OutputConfig(), std::optional<SizeType32> const &endId = std::nullopt, std::optional<SizeType32> const &padId = std::nullopt, std::optional<std::vector<SizeType32>> positionIds = std::nullopt, std::optional<std::list<VecTokens>> badWords = std::nullopt, std::optional<std::list<VecTokens>> stopWords = std::nullopt, std::optional<Tensor> embeddingBias = std::nullopt, std::optional<ExternalDraftTokensConfig> externalDraftTokensConfig = std::nullopt, std::optional<PromptTuningConfig> pTuningConfig = std::nullopt, std::optional<MropeConfig> mRopeConfig = std::nullopt, std::optional<LoraConfig> loraConfig = std::nullopt, std::optional<LookaheadDecodingConfig> lookaheadConfig = std::nullopt, std::optional<KvCacheRetentionConfig> kvCacheRetentionConfig = std::nullopt, std::optional<std::string> logitsPostProcessorName = std::nullopt, std::optional<VecTokens> encoderInputTokenIds = std::nullopt, std::optional<IdType> clientId = std::nullopt, bool returnAllGeneratedTokens = false, PriorityType priority = kDefaultPriority, RequestType type = RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION, std::optional<ContextPhaseParams> contextPhaseParams = std::nullopt, std::optional<Tensor> encoderInputFeatures = std::nullopt, std::optional<SizeType32> encoderOutputLength = std::nullopt, std::optional<Tensor> crossAttentionMask = std::nullopt, SizeType32 numReturnSequences = 1, std::optional<EagleConfig> eagleConfig = std::nullopt, std::optional<Tensor> skipCrossAttnBlocks = std::nullopt, std::optional<GuidedDecodingParams> guidedDecodingParams = std::nullopt, std::optional<MillisecondsType> allottedTimeMs = std::nullopt)

请求构造函数。

Parameters:

inputTokenIds – 输入的令牌ID
maxTokens – 生成的最大令牌数
streaming – 表示响应是否应该被流式传输。默认值为 false。
samplingConfig – 采样配置
outputConfig – 输出配置
endId – 结束令牌ID
padId – 填充标记的ID
positionIds – 输入的位置ID
badWords – 一个不良词汇标记的列表。每个“词”可以由多个标记组成
stopWords – 一个停用词标记列表。每个“词”可以由多个标记组成
embeddingBias – 嵌入偏置张量。期望类型为kFP32，形状为[vocab_size]。
externalDraftTokensConfig – 使用外部草稿令牌配置的推测性解码
pTuningConfig – 提示调优配置
loraConfig – LoRA配置
lookaheadConfig – 前瞻性推测解码配置
logitsPostProcessorName – logits后处理器的名称。必须与提供给ExecutorConfig的某个logits后处理器名称相对应。
kvCacheRetentionConfig – 用于KV缓存块淘汰的配置。
encoderInputTokenIds – 用于编码器-解码器模型或仅编码器模型的编码器输入令牌ID
returnAllGeneratedTokens – 指示是否在每次流式步骤后返回完整的束或仅返回新生成的令牌。
priority – 设置此请求的执行优先级。
encoderInputFeatures – 多模态模型的编码器输入特征。
encoderOutputLength – 如果编码器输入和输出的长度不同（由于卷积下采样等原因），则为编码器输出长度。
crossAttentionMask – 交叉注意力掩码。
type – 指示分解服务模式的请求类型。
contextPhaseParams – 仅从上下文执行器生成的令牌ID。
numReturnSequences – 返回序列的数量。
eagleConfig – EAGLE 推测解码配置
skipCrossAttnBlocks – 是否跳过交叉注意力变换器块。
guidedDecodingParams – 引导解码参数。
allottedTimeMs – 以毫秒为单位的分配时间，超过此时间后请求将以超时完成原因结束。请求总是会稍微超过这个时间，但最多只会超过1个前向传递。请求在调度之前就可能超时。

Request(Request const &other)

Request(Request &&other) noexcept

Request &operator=(Request const &other)

Request &operator=(Request &&other) noexcept

~Request()

VecTokens getInputTokenIds() const

SizeType32 getMaxTokens() const

SizeType32 getMaxNewTokens() const

bool getStreaming() const

SamplingConfig getSamplingConfig() const

OutputConfig getOutputConfig() const

std::optional<SizeType32> getEndId() const

std::optional<SizeType32> getPadId() const

std::optional<std::vector<SizeType32>> getPositionIds() const

std::optional<std::list<VecTokens>> getBadWords() const

std::optional<std::list<VecTokens>> getStopWords() const

std::optional<Tensor> getEmbeddingBias() const

std::optional<ExternalDraftTokensConfig> getExternalDraftTokensConfig() const

std::optional<PromptTuningConfig> getPromptTuningConfig() const

std::optional<MropeConfig> getMropeConfig() const

std::optional<LoraConfig> getLoraConfig() const

std::optional<LookaheadDecodingConfig> getLookaheadConfig() const

std::optional<KvCacheRetentionConfig> getKvCacheRetentionConfig() const

std::optional<std::string> getLogitsPostProcessorName() const

std::optional<VecTokens> getEncoderInputTokenIds() const

std::optional<IdType> getClientId() const

PriorityType getPriority() const

bool getReturnAllGeneratedTokens() const

std::optional<ContextPhaseParams> const &getContextPhaseParams() const

std::optional<Tensor> getEncoderInputFeatures() const

std::optional<SizeType32> getEncoderOutputLength() const

std::optional<Tensor> getCrossAttentionMask() const

RequestType getRequestType() const

SizeType32 getNumReturnSequences() const

std::optional<EagleConfig> getEagleConfig() const

std::optional<Tensor> getSkipCrossAttnBlocks() const

std::optional<GuidedDecodingParams> getGuidedDecodingParams() const

std::optional<MillisecondsType> getAllottedTimeMs() const

void setStreaming(bool streaming)

void setSamplingConfig(SamplingConfig const &config)

void setOutputConfig(OutputConfig const &outputConfig)

void setEndId(SizeType32 endId)

void setPadId(SizeType32 padId)

void setPositionIds(std::vector<SizeType32> const &positionIds)

void setBadWords(std::list<VecTokens> const &badWords)

void setStopWords(std::list<VecTokens> const &stopWords)

void setEmbeddingBias(Tensor const &embeddingBias)

void setExternalDraftTokensConfig(ExternalDraftTokensConfig const &externalDraftTokensConfig)

void setPromptTuningConfig(PromptTuningConfig const &pTuningConfig)

void setMropeConfig(MropeConfig const &mRopeConfig)

void setLoraConfig(LoraConfig const &loraConfig)

void setLookaheadConfig(LookaheadDecodingConfig const &lookaheadConfig)

void setKvCacheRetentionConfig(KvCacheRetentionConfig const &kvCacheRetentionConfig)

void setLogitsPostProcessorName(std::string const &logitsPostProcessorName)

void setEncoderInputTokenIds(VecTokens const &encoderInputTokenIds)

void setClientId(IdType clientId)

void setPriority(PriorityType priority)

void setReturnAllGeneratedTokens(bool returnAllGeneratedTokens)

void setRequestType(RequestType const &requestType)

void setContextPhaseParams(ContextPhaseParams contextPhaseParams)

void setEncoderInputFeatures(Tensor encoderInputFeatures)

void setEncoderOutputLength(SizeType32 encoderOutputLength)

void setCrossAttentionMask(Tensor crossAttentionMask)

void setNumReturnSequences(SizeType32 numReturnSequences)

void setEagleConfig(std::optional<EagleConfig> const &eagleConfig)

void setSkipCrossAttnBlocks(Tensor skipCrossAttnBlocks)

void setGuidedDecodingParams(GuidedDecodingParams const &guidedDecodingParams)

void setAllottedTimeMs(MillisecondsType allottedTimeMs)

公共静态属性

static constexpr PriorityType kDefaultPriority = 0.5

static auto constexpr kBatchedPostProcessorName = "batched": 这个logits后处理器名称将分派给批处理的logits后处理器。

私有成员

std::unique_ptr<Impl> mImpl

友元

friend class Serialization

class Response

#include <executor.h>

用于保存错误或结果的类。

公共函数

Response(IdType requestId, std::string errorMsg, std::optional<IdType> clientId = std::nullopt)

Response(IdType requestId, Result Result, std::optional<IdType> clientId = std::nullopt)

~Response()

Response(Response const &other)

Response(Response &&other) noexcept

Response &operator=(Response const &other)

Response &operator=(Response &&other) noexcept

IdType getRequestId() const: 获取生成此响应的请求的ID。

std::optional<IdType> getClientId() const: 获取生成此响应的请求的客户端ID。

bool hasError() const: 指示此响应是否有错误。

std::string const &getErrorMsg() const: 获取此响应的错误信息。如果hasError为false，将抛出异常。

Result const &getResult() const: 获取此响应的结果。如果hasError为true，将抛出异常。

私有成员

std::unique_ptr<Impl> mImpl

友元

friend class Serialization

struct Result

#include <executor.h>

保存生成结果的结构体。

公共成员

bool isFinal: 指示这是否是请求的最终结果。

BeamTokens outputTokenIds: 每个束（beam）的输出令牌。

std::optional<VecLogProbs> cumLogProbs: 累积的对数概率。大小为beamSize。

std::optional<std::vector<VecLogProbs>> logProbs: 每个生成令牌的对数概率。大小为 [beamSize, outputLen]。

std::optional<Tensor> contextLogits: 上下文logits。大小 [promptLen, vocabSizePadded]。

std::optional<Tensor> generationLogits: 生成的logits。大小 [beamSize, maxNewTokens, vocabSizePadded]（非流式）或 [maxNewTokens, beamSize, vocabSizePadded]（流式和allGeneratedTokens）或 [1, beamSize, vocabSizePadded]（流式和非allGeneratedTokens）

std::optional<SpeculativeDecodingFastLogitsInfo> specDecFastLogitsInfo: 使用快速logits时直接传输的Logits信息。

std::optional<Tensor> encoderOutput: 编码器输出。大小 [encoderLen, hiddenSize]。

std::vector<FinishReason> finishReasons: 模型在此请求中停止为每个束生成令牌的原因。大小 [beamSize]。目前仅在 beamSize 为 1 且使用 BatchingType::kINFLIGHT 时支持。

std::optional<ContextPhaseParams> contextPhaseParams: 上下文阶段的参数。

SizeType32 decodingIter = {0}: 用于生成结果的解码迭代次数。在自回归解码中，它等于outputTokenIds中beam的最大长度。在推测性解码中，可能小于outputTokenIds中beam的最大长度，因为每次迭代可以生成多个token。用于推测性解码统计。

SizeType32 sequenceIndex = {0}: 此结果的输出序列索引，其中 0 <= sequenceIndex < numReturnSequences。在束搜索（beamWidth > 1）中，此索引将始终为零，因为所有要返回的束都包含在此结果中。

bool isSequenceFinal: 指示这是请求中给定序列的最终结果。在束搜索（beamWidth > 1）中，该值将始终等于isFinal的值。

std::optional<RequestPerfMetrics> requestPerfMetrics: 如果OutputConfig中设置了returnPerfMetrics，则显示性能指标。

struct RetentionPriorityAndDuration

公共函数

inline RetentionPriorityAndDuration(std::optional<RetentionPriority> const &retentionPriority, std::optional<std::chrono::milliseconds> const &durationMs)

公共成员

std::optional<RetentionPriority> retentionPriority

std::optional<std::chrono::milliseconds> durationMs

class SamplingConfig

#include <executor.h>

采样配置。

公共函数

explicit SamplingConfig(SizeType32 beamWidth = 1, std::optional<SizeType32> const &topK = std::nullopt, std::optional<FloatType> const &topP = std::nullopt, std::optional<FloatType> const &topPMin = std::nullopt, std::optional<TokenIdType> const &topPResetIds = std::nullopt, std::optional<FloatType> const &topPDecay = std::nullopt, std::optional<RandomSeedType> const &seed = std::nullopt, std::optional<FloatType> const &temperature = std::nullopt, std::optional<SizeType32> const &minTokens = std::nullopt, std::optional<FloatType> const &beamSearchDiversityRate = std::nullopt, std::optional<FloatType> const &repetitionPenalty = std::nullopt, std::optional<FloatType> const &presencePenalty = std::nullopt, std::optional<FloatType> const &frequencyPenalty = std::nullopt, std::optional<FloatType> const &lengthPenalty = std::nullopt, std::optional<SizeType32> const &earlyStopping = std::nullopt, std::optional<SizeType32> const &noRepeatNgramSize = std::nullopt, std::optional<SizeType32> const &numReturnSequences = std::nullopt): SamplingConfig的构造函数。请参阅下面参数的描述。

bool operator==(SamplingConfig const &other) const

SizeType32 getBeamWidth() const

SizeType32 getNumReturnBeams() const

std::optional<SizeType32> getTopK() const

std::optional<FloatType> getTopP() const

std::optional<FloatType> getTopPMin() const

std::optional<SizeType32> getTopPResetIds() const

std::optional<FloatType> getTopPDecay() const

std::optional<RandomSeedType> getSeed() const

std::optional<RandomSeedType> getRandomSeed() const

std::optional<FloatType> getTemperature() const

std::optional<SizeType32> getMinTokens() const

std::optional<SizeType32> getMinLength() const

std::optional<FloatType> getBeamSearchDiversityRate() const

std::optional<FloatType> getRepetitionPenalty() const

std::optional<FloatType> getPresencePenalty() const

std::optional<FloatType> getFrequencyPenalty() const

std::optional<FloatType> getLengthPenalty() const

std::optional<SizeType32> getEarlyStopping() const

std::optional<SizeType32> getNoRepeatNgramSize() const

std::optional<SizeType32> getNumReturnSequences() const

void setBeamWidth(SizeType32 beamWidth)

void setTopK(std::optional<SizeType32> const &topK)

void setTopP(std::optional<FloatType> const &topP)

void setTopPMin(std::optional<FloatType> const &topPMin)

void setTopPResetIds(std::optional<TokenIdType> const &topPResetIds)

void setTopPDecay(std::optional<FloatType> const &topPDecay)

void setSeed(std::optional<RandomSeedType> const &seed)

void setRandomSeed(std::optional<RandomSeedType> const &randomSeed)

void setTemperature(std::optional<FloatType> const &temperature)

void setMinTokens(std::optional<SizeType32> const &minTokens)

void setMinLength(std::optional<SizeType32> const &minLength)

void setBeamSearchDiversityRate(std::optional<FloatType> const &beamSearchDiversityRate)

void setRepetitionPenalty(std::optional<FloatType> const &repetitionPenalty)

void setPresencePenalty(std::optional<FloatType> const &presencePenalty)

void setFrequencyPenalty(std::optional<FloatType> const &frequencyPenalty)

void setLengthPenalty(std::optional<FloatType> const &lengthPenalty)

void setEarlyStopping(std::optional<SizeType32> const &earlyStopping)

void setNoRepeatNgramSize(std::optional<SizeType32> const &noRepeatNgramSize)

void setNumReturnSequences(std::optional<SizeType32> const &numReturnSequences)

私有函数

void updateNumReturnBeams()

私有成员

SizeType32 mBeamWidth: 束宽（beam width）。默认值为1，这将禁用束搜索。

std::optional<SizeType32> mTopK: 控制从多少个logits中进行采样。默认值为0（所有logits）。

std::optional<FloatType> mTopP: 控制采样所依据的top-P概率。默认值为0.f。

std::optional<FloatType> mTopPMin: 控制top-P算法中的衰减。topPMin是下限。默认值为1.e-6。

std::optional<TokenIdType> mTopPResetIds: 控制top-P算法中的衰减。指示在哪里重置衰减。默认值为1。

std::optional<FloatType> mTopPDecay: 控制top-P算法中的衰减。衰减值。默认值为1.f。

std::optional<RandomSeedType> mSeed: 控制随机数生成器在采样中使用的随机种子。

std::optional<FloatType> mTemperature: 控制采样新标记时logits的调制。它可以有大于0.f的值。默认值为1.0f。

std::optional<SizeType32> mMinTokens: 生成令牌数量的下限。小于1的值无效。默认值为1。

std::optional<FloatType> mBeamSearchDiversityRate: 控制束搜索中的多样性。

std::optional<FloatType> mRepetitionPenalty: 用于根据标记在序列中出现的频率对其进行惩罚。它可以有任何大于0.f的值。值小于1.f鼓励重复，值大于1.f则抑制重复。默认值为1.f。

std::optional<FloatType> mPresencePenalty: 用于惩罚序列中已经存在的标记（无论出现次数多少）。它可以有任何值。值 < 0.f 鼓励重复，值 > 0.f 则阻止重复。默认值为 0.f。

std::optional<FloatType> mFrequencyPenalty: 用于惩罚序列中已经存在的标记（取决于出现的次数）。它可以有任何值。值 < 0.f 鼓励重复，值 > 0.f 则阻止重复。默认值为 0.f。

std::optional<FloatType> mLengthPenalty: 控制如何在束搜索中惩罚较长的序列。默认值为0.f。

std::optional<SizeType32> mEarlyStopping: 控制生成过程是否在生成beamWidth个句子后结束（以end_token结束）

std::optional<SizeType32> mNoRepeatNgramSize: 控制可接受的重复ngram大小的数量。默认值为1 << 30。

std::optional<SizeType32> mNumReturnSequences: 返回序列或束的数量。在束搜索中，该值应小于或等于mBeamWidth。在采样中，它指定独立生成的序列的总数。

SizeType32 mNumReturnBeams: 返回的束数。除非设置了numReturnSequences，否则它等于beamWidth。如果beamWidth > 1并且设置了numReturnSequences，则numReturnBeams等于numReturnSequences。

私有静态函数

static SizeType32 checkBeamWidth(SizeType32 beamWidth)

static std::optional<FloatType> const &checkTopK(std::optional<FloatType> const &topK)

static std::optional<FloatType> const &checkTopP(std::optional<FloatType> const &topP)

static std::optional<FloatType> const &checkTopPMin(std::optional<FloatType> const &topPMin)

static std::optional<TokenIdType> const &checkTopPResetIds(std::optional<TokenIdType> const &topPResetIds)

static std::optional<FloatType> const &checkTopPDecay(std::optional<FloatType> const &topPDecay)

static std::optional<FloatType> const &checkTemperature(std::optional<FloatType> const &temperature)

static std::optional<FloatType> const &checkRepetitionPenalty(std::optional<FloatType> const &penalty)

static std::optional<SizeType32> const &checkMinTokens(std::optional<SizeType32> const &minTokens)

static std::optional<SizeType32> const &checkNoRepeatNgramSize(std::optional<SizeType32> const &noRepeatNgramSize)

static std::optional<FloatType> const &checkBeamSearchDiversityRate(std::optional<FloatType> const &beamSearchDiversityRate)

static std::optional<SizeType32> const &checkNumReturnSequences(std::optional<SizeType32> const &numReturnSequences, SizeType32 beamWidth)

友元

friend class Serialization

class SchedulerConfig

#include <executor.h>

调度程序的配置类。

公共函数

explicit SchedulerConfig(CapacitySchedulerPolicy capacitySchedulerPolicy = CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT, std::optional<ContextChunkingPolicy> contextChunkingPolicy = std::nullopt, std::optional<DynamicBatchConfig> dynamicBatchConfig = std::nullopt)

bool operator==(SchedulerConfig const &other) const

CapacitySchedulerPolicy getCapacitySchedulerPolicy() const

std::optional<ContextChunkingPolicy> getContextChunkingPolicy() const

std::optional<DynamicBatchConfig> getDynamicBatchConfig() const

私有成员

CapacitySchedulerPolicy mCapacitySchedulerPolicy: 容量调度器策略。参见CapacitySchedulerPolicy。

std::optional<ContextChunkingPolicy> mContextChunkingPolicy: 上下文分块策略。参见ContextChunkingPolicy。

std::optional<DynamicBatchConfig> mDynamicBatchConfig: 用于动态调整批量大小的配置。请参阅DynamicBatchSizeConfig。

友元

friend class Serialization

class SpeculativeDecodingConfig

#include <executor.h>

推测解码的配置（包括草稿和目标模型）

公共函数

explicit SpeculativeDecodingConfig(bool fastLogits = false)

bool operator==(SpeculativeDecodingConfig const &other) const

公共成员

bool fastLogits: 直接将logits张量从草稿发送到目标模型。

struct SpeculativeDecodingFastLogitsInfo

#include <executor.h>

在使用直接传输时保存logits信息的结构。

公共函数

Tensor toTensor() const: 返回序列化为张量的结构，该张量可用作生成logits的输入。

公共成员

uint64_t draftRequestId: 草稿请求ID。

int32_t draftParticipantId: 草稿模型leader的MPI world rank。

namespace mpi

serialization.h

namespace tensorrt_llm

namespace executor

class Serialization

公共静态函数

static RequestPerfMetrics::TimePoint deserializeTimePoint(std::istream &is)

static void serialize(RequestPerfMetrics::TimePoint const &tp, std::ostream &os)

static size_t serializedSize(RequestPerfMetrics::TimePoint const&)

static RequestPerfMetrics deserializeRequestPerfMetrics(std::istream &is)

static void serialize(RequestPerfMetrics const &metrics, std::ostream &os)

static size_t serializedSize(RequestPerfMetrics const &metrics)

static SamplingConfig deserializeSamplingConfig(std::istream &is)

static void serialize(SamplingConfig const &config, std::ostream &os)

static size_t serializedSize(SamplingConfig const &config)

static OutputConfig deserializeOutputConfig(std::istream &is)

static void serialize(OutputConfig const &config, std::ostream &os)

static size_t serializedSize(OutputConfig const &config)

static ExternalDraftTokensConfig deserializeExternalDraftTokensConfig(std::istream &is)

static void serialize(ExternalDraftTokensConfig const &config, std::ostream &os)

static size_t serializedSize(ExternalDraftTokensConfig const &config)

static PromptTuningConfig deserializePromptTuningConfig(std::istream &is)

static void serialize(PromptTuningConfig const &config, std::ostream &os)

static size_t serializedSize(PromptTuningConfig const &config)

static MropeConfig deserializeMropeConfig(std::istream &is)

static void serialize(MropeConfig const &config, std::ostream &os)

static size_t serializedSize(MropeConfig const &config)

static LoraConfig deserializeLoraConfig(std::istream &is)

static void serialize(LoraConfig const &config, std::ostream &os)

static size_t serializedSize(LoraConfig const &config)

static kv_cache::CommState deserializeCommState(std::istream &is)

static void serialize(kv_cache::CommState const &state, std::ostream &os)

static size_t serializedSize(kv_cache::CommState const &state)

static kv_cache::SocketState deserializeSocketState(std::istream &is)

static void serialize(kv_cache::SocketState const &state, std::ostream &os)

static size_t serializedSize(kv_cache::SocketState const &state)

static kv_cache::CacheState deserializeCacheState(std::istream &is)

static void serialize(kv_cache::CacheState const &state, std::ostream &os)

static size_t serializedSize(kv_cache::CacheState const &state)

static DataTransceiverState deserializeDataTransceiverState(std::istream &is)

static void serialize(DataTransceiverState const &dataTransceiverState, std::ostream &os)

static size_t serializedSize(DataTransceiverState const &dataTransceiverState)

static ContextPhaseParams deserializeContextPhaseParams(std::istream &is)

static void serialize(ContextPhaseParams const &contextPhaseParams, std::ostream &os)

static size_t serializedSize(ContextPhaseParams const &contextPhaseParams)

static Request deserializeRequest(std::istream &is)

static void serialize(Request const &request, std::ostream &os)

static size_t serializedSize(Request const &request)

static Tensor deserializeTensor(std::istream &is)

static void serialize(Tensor const &tensor, std::ostream &os)

static size_t serializedSize(Tensor const &tensor)

static SpeculativeDecodingFastLogitsInfo deserializeSpecDecFastLogitsInfo(std::istream &is)

static void serialize(SpeculativeDecodingFastLogitsInfo const &info, std::ostream &os)

static size_t serializedSize(SpeculativeDecodingFastLogitsInfo const &info)

static Result deserializeResult(std::istream &is)

static void serialize(Result const &result, std::ostream &os)

static size_t serializedSize(Result const &result)

static Response deserializeResponse(std::istream &is)

static void serialize(Response const &response, std::ostream &os)

static size_t serializedSize(Response const &response)

static std::vector<Response> deserializeResponses(std::vector<char> &buffer)

static std::vector<char> serialize(std::vector<Response> const &responses)

static KvCacheConfig deserializeKvCacheConfig(std::istream &is)

static void serialize(KvCacheConfig const &kvCacheConfig, std::ostream &os)

static size_t serializedSize(KvCacheConfig const &kvCacheConfig)

static DynamicBatchConfig deserializeDynamicBatchConfig(std::istream &is)

static void serialize(DynamicBatchConfig const &dynamicBatchConfig, std::ostream &os)

static size_t serializedSize(DynamicBatchConfig const &dynamicBatchConfig)

static SchedulerConfig deserializeSchedulerConfig(std::istream &is)

static void serialize(SchedulerConfig const &schedulerConfig, std::ostream &os)

static size_t serializedSize(SchedulerConfig const &schedulerConfig)

static ExtendedRuntimePerfKnobConfig deserializeExtendedRuntimePerfKnobConfig(std::istream &is)

static void serialize(ExtendedRuntimePerfKnobConfig const &extendedRuntimePerfKnobConfig, std::ostream &os)

static size_t serializedSize(ExtendedRuntimePerfKnobConfig const &extendedRuntimePerfKnobConfig)

static ParallelConfig deserializeParallelConfig(std::istream &is)

static void serialize(ParallelConfig const &parallelConfig, std::ostream &os)

static size_t serializedSize(ParallelConfig const &parallelConfig)

static PeftCacheConfig deserializePeftCacheConfig(std::istream &is)

static void serialize(PeftCacheConfig const &peftCacheConfig, std::ostream &os)

static size_t serializedSize(PeftCacheConfig const &peftCacheConfig)

static OrchestratorConfig deserializeOrchestratorConfig(std::istream &is)

static void serialize(OrchestratorConfig const &orchestratorConfig, std::ostream &os)

static size_t serializedSize(OrchestratorConfig const &orchestratorConfig)

static DecodingMode deserializeDecodingMode(std::istream &is)

static void serialize(DecodingMode const &decodingMode, std::ostream &os)

static size_t serializedSize(DecodingMode const &decodingMode)

static LookaheadDecodingConfig deserializeLookaheadDecodingConfig(std::istream &is)

static void serialize(LookaheadDecodingConfig const &lookaheadDecodingConfig, std::ostream &os)

static size_t serializedSize(LookaheadDecodingConfig const &lookaheadDecodingConfig)

static EagleConfig deserializeEagleConfig(std::istream &is)

static void serialize(EagleConfig const &eagleConfig, std::ostream &os)

static size_t serializedSize(EagleConfig const &eagleConfig)

static SpeculativeDecodingConfig deserializeSpeculativeDecodingConfig(std::istream &is)

static void serialize(SpeculativeDecodingConfig const &specDecConfig, std::ostream &os)

static size_t serializedSize(SpeculativeDecodingConfig const &specDecConfig)

static GuidedDecodingConfig deserializeGuidedDecodingConfig(std::istream &is)

static void serialize(GuidedDecodingConfig const &guidedDecodingConfig, std::ostream &os)

static size_t serializedSize(GuidedDecodingConfig const &guidedDecodingConfig)

static GuidedDecodingParams deserializeGuidedDecodingParams(std::istream &is)

static void serialize(GuidedDecodingParams const &guidedDecodingParams, std::ostream &os)

static size_t serializedSize(GuidedDecodingParams const &guidedDecodingParams)

static KvCacheRetentionConfig deserializeKvCacheRetentionConfig(std::istream &is)

static void serialize(KvCacheRetentionConfig const &kvCacheRetentionConfig, std::ostream &os)

static size_t serializedSize(KvCacheRetentionConfig const &kvCacheRetentionConfig)

static KvCacheRetentionConfig::TokenRangeRetentionConfig deserializeTokenRangeRetentionConfig(std::istream &is)

static void serialize(KvCacheRetentionConfig::TokenRangeRetentionConfig const &tokenRangeRetentionConfig, std::ostream &os)

static size_t serializedSize(KvCacheRetentionConfig::TokenRangeRetentionConfig const &tokenRangeRetentionConfig)

static DecodingConfig deserializeDecodingConfig(std::istream &is)

static void serialize(DecodingConfig const &decodingConfig, std::ostream &os)

static size_t serializedSize(DecodingConfig const &decodingConfig)

static DebugConfig deserializeDebugConfig(std::istream &is)

static void serialize(DebugConfig const &debugConfig, std::ostream &os)

static size_t serializedSize(DebugConfig const &debugConfig)

static ExecutorConfig deserializeExecutorConfig(std::istream &is)

static void serialize(ExecutorConfig const &executorConfig, std::ostream &os)

static size_t serializedSize(ExecutorConfig const &executorConfig)

static KvCacheStats deserializeKvCacheStats(std::istream &is)

static void serialize(KvCacheStats const &kvCacheStats, std::ostream &os)

static size_t serializedSize(KvCacheStats const &kvCacheStats)

static StaticBatchingStats deserializeStaticBatchingStats(std::istream &is)

static void serialize(StaticBatchingStats const &staticBatchingStats, std::ostream &os)

static size_t serializedSize(StaticBatchingStats const &staticBatchingStats)

static InflightBatchingStats deserializeInflightBatchingStats(std::istream &is)

static void serialize(InflightBatchingStats const &inflightBatchingStats, std::ostream &os)

static size_t serializedSize(InflightBatchingStats const &inflightBatchingStats)

static IterationStats deserializeIterationStats(std::vector<char> &buffer)

static IterationStats deserializeIterationStats(std::istream &is)

static void serialize(IterationStats const &iterStats, std::ostream &os)

static std::vector<char> serialize(IterationStats const &iterStats)

static size_t serializedSize(IterationStats const &iterStats)

static std::vector<char> serialize(std::vector<IterationStats> const &iterStatsVec)

static std::vector<IterationStats> deserializeIterationStatsVec(std::vector<char> &buffer)

static DisServingRequestStats deserializeDisServingRequestStats(std::istream &is)

static void serialize(DisServingRequestStats const &stats, std::ostream &os)

static size_t serializedSize(DisServingRequestStats const &disServingRequestStats)

static RequestStage deserializeRequestStage(std::istream &is)

static void serialize(RequestStage const &requestStage, std::ostream &os)

static size_t serializedSize(RequestStage const &requestStage)

static RequestStats deserializeRequestStats(std::istream &is)

static void serialize(RequestStats const &state, std::ostream &os)

static size_t serializedSize(RequestStats const &state)

static RequestStatsPerIteration deserializeRequestStatsPerIteration(std::istream &is)

static RequestStatsPerIteration deserializeRequestStatsPerIteration(std::vector<char> &buffer)

static void serialize(RequestStatsPerIteration const &state, std::ostream &os)

static std::vector<char> serialize(RequestStatsPerIteration const &state)

static size_t serializedSize(RequestStatsPerIteration const &state)

static std::vector<char> serialize(std::vector<RequestStatsPerIteration> const &requestStatsVec)

static std::vector<RequestStatsPerIteration> deserializeRequestStatsPerIterationVec(std::vector<char> &buffer)

static std::string deserializeString(std::istream &is)

static bool deserializeBool(std::istream &is)

static ModelType deserializeModelType(std::istream &is)

namespace kv_cache

tensor.h

namespace tensorrt_llm

namespace executor

class Shape : public tensorrt_llm::common::ArrayView<detail::DimType64 const>

公共类型

using Base = tensorrt_llm::common::ArrayView<detail::DimType64 const>

using DimType64 = typename std::remove_cv_t<Base::value_type>

公共函数

inline Shape()

inline Shape(DimType64 const *data, Base::size_type size)

inline Shape(std::initializer_list<DimType64> dims)

class Tensor

公共类型

using CudaStreamPtr = std::shared_ptr<runtime::CudaStream>

公共函数

Tensor copyToCpu(Tensor::CudaStreamPtr stream = nullptr) const

Tensor copyToPinned(Tensor::CudaStreamPtr stream = nullptr) const

Tensor copyToPooledPinned(Tensor::CudaStreamPtr stream = nullptr) const

Tensor copyToManaged(Tensor::CudaStreamPtr stream = nullptr) const

Tensor copyToGpu(Tensor::CudaStreamPtr stream) const

Tensor() noexcept = default

~Tensor() = default

Tensor(Tensor const &other) noexcept = default

Tensor(Tensor &&other) noexcept = default

Tensor &operator=(Tensor const &other) noexcept = default

Tensor &operator=(Tensor &&other) noexcept = default

void *getData(): 返回指向底层数组的指针。

void const *getData() const: 返回指向底层数组的指针。

DataType getDataType() const: 返回缓冲区的数据类型。

MemoryType getMemoryType() const: 返回缓冲区的内存类型。

Shape getShape() const: 返回张量的维度。

std::size_t getSize() const: 返回张量中的元素数量。

std::size_t getSizeInBytes() const: 返回张量的大小，单位为字节。

void setZero(CudaStreamPtr stream = nullptr)

将整个内存设置为零。

Parameters: stream – 如果内存类型是GPU，则必须是一个有效的CUDA流。

void setFrom(Tensor const &other, CudaStreamPtr stream = nullptr)

从另一个张量复制数据和形状。

Parameters:

other – 一个要从中复制的张量。
stream – 如果内存类型是GPU，则必须是一个有效的CUDA流。

inline explicit operator bool() const

inline bool operator==(Tensor const &rhs) const

inline bool operator!=(Tensor const &rhs) const

公共静态函数

static Tensor cpu(DataType dataType, Shape shape = {})

分配一个具有给定形状和数据类型的CPU张量。

Parameters:

shape – 张量的形状。
dataType – 张量的数据类型。

template<typename T> static inline Tensor cpu(Shape shape = {})

static Tensor pinned(DataType dataType, Shape shape = {})

在固定内存中分配一个具有给定形状和数据类型的CPU张量。

Parameters:

shape – 张量的形状。
dataType – 张量的数据类型。

template<typename T> static inline Tensor pinned(Shape shape = {})

static Tensor pooledPinned(DataType dataType, Shape shape = {})

在池化的固定内存中分配一个具有给定形状和数据类型的CPU张量。

Parameters:

shape – 张量的形状。
dataType – 张量的数据类型。

template<typename T> static inline Tensor pooledPinned(Shape shape = {})

static Tensor managed(DataType dataType, Shape shape = {})

在托管内存（UVM）中分配一个具有给定形状和数据类型的张量。

Parameters:

shape – 张量的形状。
dataType – 张量的数据类型。

template<typename T> static inline Tensor managed(Shape shape = {})

static Tensor gpu(DataType dataType, CudaStreamPtr stream, Shape shape = {})

在特定的cuda流上分配具有给定形状和数据类型的gpu张量。

Parameters:

shape – 张量的形状。
stream – 指定在哪个CUDA流上为GPU内存分配张量。
dataType – 张量的数据类型。

template<typename T> static inline Tensor gpu(CudaStreamPtr stream, Shape shape = {})

static Tensor of(DataType dataType, void *data, Shape shape)

将数据指针包装成张量而不获取所有权。

Parameters:

shape – 张量的形状。
dataType – 张量的数据类型。
data – 指向要包装的数据的指针；张量不会获取其所有权。

template<typename T> static inline Tensor of(T *data, Shape shape)

将数据指针包装成张量而不获取所有权。

Parameters:

shape – 张量的形状。
data – 指向要包装的数据的指针；张量不会获取其所有权。

template<typename T> static inline Tensor of(T &data)

将任何容器包装成张量而不获取所有权。

Parameters:

data – 要包装的容器；张量不会获取其所有权。

私有类型

using Impl = runtime::ITensor 

私有函数

explicit Tensor(std::shared_ptr<runtime::ITensor> tensor)

Tensor copyTo(std::shared_ptr<Impl> tensor, CudaStreamPtr stream) const

私有成员

std::shared_ptr<Impl> mTensor

私有静态函数

template<typename T> static inline DataType getRuntimeType()

友元

friend class Serialization

friend std::shared_ptr<runtime::ITensor> const &toITensor(Tensor const &tensor)

friend Tensor ofITensor(std::shared_ptr<runtime::ITensor> tensor)

namespace detail

类型定义

using DimType64 = int64_t

函数

std::shared_ptr<runtime::ITensor> const &toITensor(Tensor const &tensor)

Tensor ofITensor(std::shared_ptr<runtime::ITensor> tensor)

namespace runtime

types.h

template<> struct TypeTraits<std::int8_t>

公共静态属性

static constexpr auto value = DataType::kINT8

template<> struct TypeTraits<std::int32_t>

公共静态属性

static constexpr auto value = DataType::kINT32

template<> struct TypeTraits<std::int64_t>

公共静态属性

static constexpr auto value = DataType::kINT64

template<> struct TypeTraits<std::uint8_t>

公共静态属性

static constexpr auto value = DataType::kUINT8

namespace tensorrt_llm

namespace executor

类型定义

using TensorPtr = std::shared_ptr<Tensor>

using SizeType32 = std::int32_t

using FloatType = float

using TokenIdType = std::int32_t

using VecTokens = std::vector<TokenIdType>

using BeamTokens = std::vector<VecTokens>

using IdType = std::uint64_t

using VecTokenExtraIds = std::vector<IdType>

using IterationType = std::uint64_t

using RandomSeedType = std::uint64_t

using VecLogProbs = std::vector<FloatType>

using StreamPtr = std::shared_ptr<tensorrt_llm::runtime::CudaStream>

using MillisecondsType = std::chrono::milliseconds

using LogitsPostProcessor = std::function<void(IdType, Tensor&, BeamTokens const&, StreamPtr const&, std::optional<IdType>)>

using LogitsPostProcessorMap = std::unordered_map<std::string, LogitsPostProcessor>

using LogitsPostProcessorBatched = std::function<void(std::vector<IdType> const&, std::vector<Tensor>&, std::vector<std::reference_wrapper<BeamTokens const>> const&, StreamPtr const&, std::vector<std::optional<IdType>> const&)>

using MedusaChoices = std::vector<std::vector<SizeType32>>

using EagleChoices = std::vector<std::vector<SizeType32>>

using PriorityType = float

using BufferView = std::basic_string_view<uint8_t>

枚举

enum class DataType

值：

enumerator kBOOL

enumerator kUINT8

enumerator kINT8

enumerator kINT32

enumerator kINT64

enumerator kBF16

enumerator kFP8

enumerator kFP16

enumerator kFP32

enumerator kUNKNOWN

enum class RequestType

值：

enumerator REQUEST_TYPE_CONTEXT_AND_GENERATION

enumerator REQUEST_TYPE_CONTEXT_ONLY

enumerator REQUEST_TYPE_GENERATION_ONLY

enum class MemoryType

值：

enumerator kCPU

enumerator kCPU_PINNED

enumerator kCPU_PINNEDPOOL

enumerator kGPU

enumerator kUVM

enumerator kUNKNOWN

enum class ModelType

值：

enumerator kDECODER_ONLY

enumerator kENCODER_ONLY

enumerator kENCODER_DECODER

enum class BatchingType

批处理类型。

值：

enumerator kSTATIC: STATIC 指的是传统的批处理方案，其中一批请求同步运行，直到所有请求的完整生成完成。批处理中的请求都会被填充到批处理中任何成员的最大输入和输出序列长度。

enumerator kINFLIGHT: INFLIGHT 指的是一种方案，其中新到达的请求被动态地纳入正在执行的批次中，并且一旦满足结束条件，请求就会立即返回，无需任何填充。

enum class CapacitySchedulerPolicy

用于在执行器生成循环的每次迭代中选择可用请求子集的策略。

值：

enumerator kMAX_UTILIZATION: MAX_UTILIZATION 在 InflightBatching 生成循环的每次迭代中，尽可能多地打包底层 TRT 引擎支持的请求。虽然这有望最大化 GPU 吞吐量，但根据峰值 KV 缓存内存的可用性，可能需要暂停并重新启动某些请求。

enumerator kGUARANTEED_NO_EVICT: GUARANTEED_NO_EVICT 更保守地使用 KV 缓存，保证一旦请求开始，将运行到完成而不会被驱逐。

enumerator kSTATIC_BATCH: kSTATIC_BATCH 在当前批次中的所有请求完成之前不会调度新的请求。类似于 kGUARANTEED_NO_EVICT，请求将运行到完成而不会被驱逐。

enum class ContextChunkingPolicy

值：

enumerator kFIRST_COME_FIRST_SERVED: 顺序分块，首先完成未完成的上下文阶段。

enumerator kEQUAL_PROGRESS: 依次遍历每个上下文请求，并尝试增加其块计数，直到超过约束条件。

enum class CommunicationType

值：

enumerator kMPI

enum class CommunicationMode

值：

enumerator kLEADER

enumerator kORCHESTRATOR

enum class RequestStage

表示请求状态的枚举类。

值：

enumerator kQUEUED: 已接收但尚未纳入活动请求的请求（例如，由于最大批量大小等限制）。

enumerator kENCODER_IN_PROGRESS: 编码器阶段中的活动请求。

enumerator kCONTEXT_IN_PROGRESS: 上下文阶段中的活动请求。

enumerator kGENERATION_IN_PROGRESS: 生成阶段中的活动请求。

enumerator kGENERATION_COMPLETE: 生成已完成的活动请求。

enum class FinishReason

模型停止为请求生成令牌的原因。

值：

enumerator kNOT_FINISHED: 请求尚未完成。

enumerator kEND_ID: 请求已完成，因为生成了结束ID。

enumerator kSTOP_WORDS: 请求完成，因为生成了一个停止词。

enumerator kLENGTH: 请求完成，因为达到了最大令牌数。

enumerator kTIMED_OUT: 请求因超时而完成（通过mAllotedTime参数）

enumerator kCANCELLED: 请求通过调用cancelRequest被取消。

函数

std::ostream &operator<<(std::ostream &os, CapacitySchedulerPolicy policy)

std::ostream &operator<<(std::ostream &os, ContextChunkingPolicy policy)

struct DebugTensorsPerIteration

#include <types.h>

保存迭代中调试张量的结构。

公共成员

IterationType iter: 这些张量的迭代ID。

std::map<std::string, Tensor> debugTensors: 本次迭代的调试张量。

class DecodingMode

#include <types.h>

解码器的模式

公共类型

using UnderlyingType = uint32_t

公共函数

inline auto constexpr useTemperature(bool useTemp)

inline auto constexpr useOccurrencePenalties(bool usePenalty)

inline auto constexpr usePresencePenalty(bool usePenalty)

inline auto constexpr useRepetitionPenalty(bool usePenalty)

inline auto constexpr useFrequencyPenalty(bool usePenalty)

inline auto constexpr useMinLength(bool useMinLen)

inline auto constexpr useBanTokens(bool banTokens)

inline auto constexpr useBanWords(bool banWords)

inline auto constexpr useNoRepeatNgramSize(bool noRepeatNgramSize)

inline auto constexpr useStopWords(bool stopWords)

inline auto constexpr useMaxLengthStop(bool maxLengthStop)

inline auto constexpr useExplicitEosStop(bool explicitEosStop)

inline bool constexpr isAuto() const

inline bool constexpr isTopK() const

inline bool constexpr isTopP() const

inline bool constexpr isTopKorTopP() const

inline bool constexpr isTopKandTopP() const

inline bool constexpr isBeamSearch() const

inline bool constexpr isMedusa() const

inline bool constexpr isLookahead() const

inline bool constexpr isExplicitDraftTokens() const

inline bool constexpr isExternalDraftTokens() const

inline bool constexpr isEagle() const

inline bool constexpr isUseTemperature() const

inline bool constexpr isUsePresencePenalty() const

inline bool constexpr isUseFrequencyPenalty() const

inline bool constexpr isUseRepetitionPenalty() const

inline bool constexpr isUseMinLength() const

inline bool constexpr isUseOccurrencePenalty() const

inline bool constexpr isUsePenalty() const

inline bool constexpr isUseBanWords() const

inline bool constexpr isUseNoRepeatNgramSize() const

inline bool constexpr isUseBanTokens() const

inline bool constexpr isUseStopWords() const

inline bool constexpr isUseMaxLengthStop() const

inline bool constexpr isUseExplicitEosStop() const

inline bool constexpr isUseStopCriteria() const

inline bool operator==(DecodingMode const &other) const

inline explicit constexpr DecodingMode(UnderlyingType state)

inline constexpr UnderlyingType getState() const

公共静态函数

static inline auto constexpr Auto(): 未指定模式。配置将在运行时根据第一个请求的beam宽度确定，如果beamWidth == 1，则使用TopKTopP，否则使用BeamSearch。

static inline auto constexpr TopK()

static inline auto constexpr TopP()

static inline auto constexpr TopKTopP()

static inline auto constexpr BeamSearch()

static inline auto constexpr Medusa()

static inline auto constexpr Lookahead()

static inline auto constexpr ExplicitDraftTokens()

static inline auto constexpr ExternalDraftTokens()

static inline auto constexpr Eagle()

私有函数

inline bool constexpr anyBitSet(UnderlyingType bits) const

inline bool constexpr allBitSet(UnderlyingType bits) const

inline UnderlyingType constexpr setBitTo(UnderlyingType state, bool x)

私有成员

UnderlyingType mState = {}

私有静态属性

static UnderlyingType constexpr kUseRepetitionPenalties = {1u << 0}

static UnderlyingType constexpr kUseFrequencyPenalties = {1u << 1}

static UnderlyingType constexpr kUsePresencePenalties = {1u << 2}

static UnderlyingType constexpr kUseTemperature = {1u << 3}

static UnderlyingType constexpr kUseMinLength = {1u << 4}

static UnderlyingType constexpr kUseBanWords = {1u << 5}

static UnderlyingType constexpr kUseStopWords = {1u << 6}

static UnderlyingType constexpr kUseMaxLengthStop = {1u << 7}

static UnderlyingType constexpr kUseExplicitEosStop = {1u << 8}

static UnderlyingType constexpr kUseNoRepeatNgramSize = {1u << 9}

static UnderlyingType constexpr kStandardStopCriteria = {kUseStopWords | kUseMaxLengthStop}

static UnderlyingType constexpr kUseOccurrencePenalties{kUseRepetitionPenalties | kUseFrequencyPenalties | kUsePresencePenalties}

static UnderlyingType constexpr kUsePenalties = {kUseOccurrencePenalties | kUseTemperature | kUseMinLength}

static UnderlyingType constexpr kUseBanTokens = {kUseNoRepeatNgramSize | kUseBanWords}

static SizeType32 constexpr kNumFlags = {10}

static UnderlyingType constexpr kAuto = {1u << (kNumFlags + 0)}

static UnderlyingType constexpr kTopK = {1u << (kNumFlags + 1)}

static UnderlyingType constexpr kTopP = {1u << (kNumFlags + 2)}

static UnderlyingType constexpr kBeamSearch = {1u << (kNumFlags + 3)}

static UnderlyingType constexpr kMedusa = {1u << (kNumFlags + 4)}

static UnderlyingType constexpr kLookahead = {1u << (kNumFlags + 5)}

static UnderlyingType constexpr kExplicitDraftTokens = {1u << (kNumFlags + 6)}

static UnderlyingType constexpr kExternalDraftTokens = {1u << (kNumFlags + 7)}

static UnderlyingType constexpr kEagle = {1u << (kNumFlags + 8)}

static UnderlyingType constexpr kTopKTopP = {kTopK | kTopP}

struct DisServingRequestStats

#include <types.h>

在分散服务的情况下，保存请求统计信息的结构。

公共成员

double kvCacheTransferMS: 将KV缓存从上下文阶段传输到生成阶段所花费的总时间（毫秒）

struct InflightBatchingStats

#include <types.h>

保存单次迭代中飞行批处理模型统计信息的结构体。

公共成员

SizeType32 numScheduledRequests: 计划请求的数量。

SizeType32 numContextRequests: 上下文阶段中的请求数量。

SizeType32 numGenRequests: 生成阶段的请求数量。

SizeType32 numPausedRequests: 暂停请求的数量。

SizeType32 numCtxTokens: 迭代中的上下文令牌总数。

SizeType32 microBatchId: 微批次的索引。

float avgNumDecodedTokensPerIter: 每次迭代每个请求解码的平均令牌数。

struct IterationStats

#include <types.h>

保存单次迭代统计信息的结构体。

公共成员

std::string timestamp: 本次迭代的结束时间。

IterationType iter: 迭代ID。

double iterLatencyMS: 迭代延迟（毫秒）

double newActiveRequestsQueueLatencyMS: 在此迭代中变为活动的请求在队列中花费的总时间（毫秒）

SizeType32 numNewActiveRequests: 新获取的活动请求数量。

SizeType32 numActiveRequests: 活跃请求的数量。

SizeType32 numQueuedRequests: 排队请求的数量。

SizeType32 numCompletedRequests: 在此迭代中完成的请求数量。

SizeType32 maxNumActiveRequests: 最大活动请求数。

SizeType32 maxBatchSizeStatic: 传递给执行器的静态最大批量大小。

SizeType32 maxBatchSizeTunerRecommended: 基于输入统计信息由动态调谐器生成的批量大小。

SizeType32 maxBatchSizeRuntime: maxBatchSizeStatic 和 maxBatchSizeRuntimeUpperbound 的最小值。

SizeType32 maxNumTokensStatic: 传递给执行器的静态最大令牌数。

SizeType32 maxNumTokensTunerRecommended: 动态调谐器根据输入统计数据生成的最大令牌数。

SizeType32 maxNumTokensRuntime: 运行时最大令牌数。

size_t gpuMemUsage: GPU内存使用量（以字节为单位）。

size_t cpuMemUsage: CPU内存使用量，单位为字节。

size_t pinnedMemUsage: 固定内存使用量（以字节为单位）。

std::optional<KvCacheStats> kvCacheStats: 特定于KV缓存的统计信息。

std::optional<KvCacheStats> crossKvCacheStats: 特定于跨KV缓存的统计信息。

std::optional<StaticBatchingStats> staticBatchingStats: 特定于静态批处理的统计信息。

std::optional<InflightBatchingStats> inflightBatchingStats: 与飞行中批处理相关的统计信息。

struct KvCacheStats

#include <types.h>

保存KV缓存管理器统计信息的结构体。

公共成员

SizeType32 maxNumBlocks: 最大块数。

SizeType32 freeNumBlocks: 空闲块的数量。

SizeType32 usedNumBlocks: 已使用的块数。

SizeType32 tokensPerBlock: 每个区块的令牌数量。

SizeType32 allocTotalBlocks: 已分配块的总数。

SizeType32 allocNewBlocks: 新分配的块数。

SizeType32 reusedBlocks: 重复使用的块数。

SizeType32 missedBlocks: 未重复使用的块数。

float cacheHitRate: 测量KV缓存重用率。cacheHitRate = reusedBlocks / (reusedBlocks + missedBlocks)。

struct RequestPerfMetrics

#include <types.h>

保存请求统计信息的结构体。

公共类型

using TimePoint = std::chrono::time_point<std::chrono::steady_clock>

公共成员

TimingMetrics timingMetrics

KvCacheMetrics kvCacheMetrics

std::optional<IterationType> firstIter: 首次处理该请求的迭代。

std::optional<IterationType> lastIter: 最后一次生成令牌的迭代。

std::optional<IterationType> iter: 当前迭代。

struct KvCacheMetrics

公共成员

SizeType32 numTotalAllocatedBlocks = {0}: 已分配块的总数。

SizeType32 numNewAllocatedBlocks = {0}: 新分配的块数。

SizeType32 numReusedBlocks = {0}: 重复使用的块数。

SizeType32 numMissedBlocks = {0}: 错过的区块数量。

SizeType32 kvCacheHitRate = {0}: KV缓存命中率，定义为重用块数 / (重用块数 + 未命中块数)

struct TimingMetrics

公共成员

TimePoint arrivalTime: 请求到达的时间。

TimePoint firstScheduledTime: 请求首次被调度的时间。

TimePoint firstTokenTime: 生成第一个令牌的时间。

TimePoint lastTokenTime: 请求完成的时间。

TimePoint kvCacheTransferStart: KV缓存传输的开始时间，用于分解服务。

TimePoint kvCacheTransferEnd: 用于分解服务的KV缓存传输的结束时间。

struct RequestStats

#include <types.h>

保存单个请求统计信息的结构体。

公共成员

IdType id: 请求ID。

RequestStage stage: 请求当前所处的阶段。

SizeType32 contextPrefillPosition: 如果使用分块上下文，则为当前上下文预填充位置。

SizeType32 numGeneratedTokens: 到目前为止生成的令牌数量。

float avgNumDecodedTokensPerIter: 每次迭代解码的平均令牌数。对于推测性解码，它 >= 1。

bool scheduled: 请求是否已安排在当前迭代中。

bool paused: 请求是否由于资源不足（例如KV缓存块耗尽）而在当前迭代中被暂停

std::optional<DisServingRequestStats> disServingStats: 特定于分散服务的统计信息。

SizeType32 allocTotalBlocksPerRequest: 每个请求分配的总块数。

SizeType32 allocNewBlocksPerRequest: 每个请求新分配的块数。

SizeType32 reusedBlocksPerRequest: 每个请求中重复使用的块数。

SizeType32 missedBlocksPerRequest: 每个请求中错过的区块数量。

SizeType32 kvCacheHitRatePerRequest: 每个请求的KV缓存命中率，定义为 reusedBlocks / (reusedBlocks + missedBlocks)

struct RequestStatsPerIteration

#include <types.h>

保存迭代中所有请求统计信息的结构体。

公共成员

IterationType iter: 这些统计数据的迭代ID。

std::vector<RequestStats> requestStats: 此迭代中所有活动请求的统计信息。

struct StaticBatchingStats

#include <types.h>

保存单次迭代中静态批处理模型统计信息的结构体。

公共成员

SizeType32 numScheduledRequests: 计划请求的数量。

SizeType32 numContextRequests: 上下文阶段中的请求数量。

SizeType32 numCtxTokens: 迭代中的上下文令牌总数。

SizeType32 numGenTokens: 迭代中生成的令牌总数。

SizeType32 emptyGenSlots: 未使用的生成令牌槽的总数。

template<typename T, bool = false> struct TypeTraits

#include <types.h>

将 C++ 数据类型转换为对应的 TRT-LLM DataType。

template<> struct TypeTraits<bool>

公共静态属性

static constexpr auto value = DataType::kBOOL

template<> struct TypeTraits<float>

公共静态属性

static constexpr auto value = DataType::kFP32

template<> struct TypeTraits<half>

公共静态属性

static constexpr auto value = DataType::kFP16

template<> struct TypeTraits<std::int32_t>

公共静态属性

static constexpr auto value = DataType::kINT32

template<> struct TypeTraits<std::int64_t>

公共静态属性

static constexpr auto value = DataType::kINT64

template<> struct TypeTraits<std::int8_t>

公共静态属性

static constexpr auto value = DataType::kINT8

template<> struct TypeTraits<std::uint8_t>

公共静态属性

static constexpr auto value = DataType::kUINT8

template<typename T> struct TypeTraits<T*>

公共静态属性

static constexpr auto value = DataType::kINT64

namespace runtime

version.h

namespace tensorrt_llm

namespace executor

变量

static auto constexpr kTensorRtLlmVersion = "0.14.0.dev2024100800"