运行时

bufferManager.h

namespace tensorrt_llm

namespace runtime

class BufferManager

#include <bufferManager.h>

用于管理主机和设备内存的辅助类。

公共类型

using IBufferPtr = IBuffer::UniquePtr 

using ITensorPtr = ITensor::UniquePtr 

using CudaStreamPtr = std::shared_ptr<CudaStream>

using CudaMemPoolPtr = std::shared_ptr<CudaMemPool>

公共函数

explicit BufferManager(CudaStreamPtr stream, bool trimPool = false)

构建一个BufferManager。

Parameters:: stream – [in] 用于GPU上所有操作（分配、释放、复制等）的cuda流。

inline ~BufferManager(): 析构函数。

IBufferPtr gpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const: 在GPU上使用cudaMallocAsync分配一个给定大小的IBuffer。

ITensorPtr gpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const: 使用cudaMallocAsync在GPU上分配一个给定维度的ITensor。

IBufferPtr allocate(MemoryType memoryType, std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const: 分配一个给定大小和内存类型的IBuffer。

ITensorPtr allocate(MemoryType memoryType, nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const: 分配一个具有给定维度和内存类型的ITensor。

inline IBufferPtr emptyBuffer(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const: 创建一个给定内存类型的空IBuffer。它可以在以后调整大小。

inline ITensorPtr emptyTensor(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const: 创建一个给定内存类型的空ITensor。它可以在以后重新调整形状。

void setMem(IBuffer &buffer, int32_t value) const: 将给定的buffer的内容设置为值。

void setZero(IBuffer &buffer) const: 将给定的buffer内容设置为零。

void copy(void const *src, IBuffer &dst, MemoryType srcType) const: 将 src 复制到 dst。

void copy(IBuffer const &src, void *dst, MemoryType dstType) const: 将 src 复制到 dst。

inline void copy(void const *src, IBuffer &dst) const: 将 src 复制到 dst。

inline void copy(IBuffer const &src, void *dst) const: 将 src 复制到 dst。

void copy(IBuffer const &src, IBuffer &dst) const: 将 src 复制到 dst。

IBufferPtr copyFrom(IBuffer const &src, MemoryType memoryType) const: 将src复制到一个新的IBuffer中，可能具有不同的内存类型。

ITensorPtr copyFrom(ITensor const &src, MemoryType memoryType) const: 将src复制到一个新的ITensor中，可能具有不同的内存类型。

template<typename T> inline IBufferPtr copyFrom(std::vector<T> const &src, MemoryType memoryType) const: 将src复制到一个新的IBuffer中，可能具有不同的内存类型。

template<typename T> inline ITensorPtr copyFrom(T *src, nvinfer1::Dims dims, MemoryType memoryType) const: 将src复制到一个新的ITensor中，可能具有不同的内存类型。

template<typename T> inline ITensorPtr copyFrom(std::vector<T> const &src, nvinfer1::Dims dims, MemoryType memoryType) const: 将src复制到一个新的ITensor中，可能具有不同的内存类型。

CudaStream const &getStream() const: 获取底层的cuda流。

std::size_t memoryPoolReserved() const: 内存池当前保留的内存大小。

std::size_t memoryPoolUsed() const: 内存池当前使用的内存大小。

std::size_t memoryPoolFree() const: 内存池中当前空闲内存的大小。

void memoryPoolTrimTo(std::size_t size): 尝试将池保留的内存修剪到size字节。这会隐式地与流同步。

公共静态函数

static IBufferPtr gpuSync(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE): 在GPU上使用cudaMalloc分配一个给定大小的IBuffer。

static ITensorPtr gpuSync(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE): 使用cudaMalloc在GPU上分配一个给定维度的ITensor。

static IBufferPtr cpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE): 在CPU上分配一个给定大小的IBuffer。

static ITensorPtr cpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE): 在CPU上分配一个给定维度的ITensor。

static IBufferPtr pinned(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE): 在CPU上分配一个给定大小的固定内存（pinned，页锁定）IBuffer。

static ITensorPtr pinned(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE): 在CPU上分配一个给定维度的固定内存（pinned，页锁定）ITensor。

static IBufferPtr pinnedPool(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE): 在CPU上的默认内存池中分配一个给定大小的固定内存（pinned，页锁定）IBuffer。

static ITensorPtr pinnedPool(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE): 在CPU上的默认内存池中分配一个给定维度的固定内存（pinned，页锁定）ITensor。

static IBufferPtr managed(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE): 在UVM中分配一个给定大小的IBuffer。

static ITensorPtr managed(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE): 在UVM中分配一个给定维度的ITensor。

公共静态属性

static auto constexpr kBYTE_TYPE = nvinfer1::DataType::kUINT8

私有成员

CudaStreamPtr mStream

CudaMemPoolPtr mPool

bool const mTrimPool

友元

friend class ::BufferManagerTest

common.h

定义

FMT_DIM

namespace tensorrt_llm

namespace runtime

类型定义

using SizeType32 = std::int32_t

using SizeType64 = std::int64_t

using TokenIdType = std::int32_t

using LoraTaskIdType = std::uint64_t

using TokenExtraIdType = std::uint64_t

using VecTokenExtraIds = std::vector<TokenExtraIdType>

using VecUniqueTokens = std::vector<UniqueToken>

template<typename T> using StringPtrMap = std::unordered_map<std::string, std::shared_ptr<T>>

枚举

enum class RequestType : std::int32_t

值：

enumerator kCONTEXT

enumerator kGENERATION

struct UniqueToken

公共函数

inline bool operator==(UniqueToken const &other) const noexcept

公共成员

TokenIdType tokenId

TokenExtraIdType tokenExtraId

cudaEvent.h

namespace tensorrt_llm

namespace runtime

class CudaEvent

公共类型

using pointer = cudaEvent_t

公共函数

inline explicit CudaEvent(unsigned int flags = cudaEventDisableTiming)

创建一个新的cuda事件。该事件将在析构函数中被销毁。

Parameters:: flags – 事件创建的标志。默认情况下，事件计时是禁用的。

inline explicit CudaEvent(pointer event, bool ownsEvent = true)

将一个现有的cuda事件传递给此对象。

Parameters:

event – 传递给此对象的事件。
ownsEvent – 此对象是否拥有该事件并在析构函数中销毁它。

inline pointer get() const: 返回与此对象关联的事件。

inline void synchronize() const: 同步事件。

私有类型

using element_type = std::remove_pointer_t<pointer>

using EventPtr = std::unique_ptr<element_type, Deleter>

私有成员

EventPtr mEvent

class Deleter

公共函数

inline explicit Deleter(bool ownsEvent)

inline explicit Deleter()

inline constexpr void operator()(pointer event) const

私有成员

bool mOwnsEvent

cudaStream.h

namespace tensorrt_llm

namespace runtime

class CudaStream

公共函数

inline explicit CudaStream(unsigned int flags = cudaStreamNonBlocking, int priority = 0)

在当前设备上创建一个新的cuda流。该流将在析构函数中被销毁。

Parameters:

flags – 用于流创建的标志。有关可以传递的有效标志列表，请参见 ::cudaStreamCreateWithFlags。
priority – 流的优先级。数字越小表示优先级越高。有关可以传递的有意义的流优先级的更多信息，请参见 ::cudaDeviceGetStreamPriorityRange。

inline explicit CudaStream(cudaStream_t stream, int device, bool ownsStream = true)

将一个现有的cuda流传递给此对象。

Parameters:

stream – 传递给此对象的流。
device – 创建流的设备。
ownsStream – 此对象是否拥有流并在析构函数中销毁它。

inline explicit CudaStream(cudaStream_t stream): 使用现有的cuda流或通过传递nullptr使用默认流进行构造。

inline int getDevice() const: 返回创建流的设备。

inline cudaStream_t get() const: 返回与此对象关联的流。

inline void synchronize() const: 同步流。

inline void record(CudaEvent::pointer event) const: 在流上记录一个事件。

inline void record(CudaEvent const &event) const: 在流上记录一个事件。

inline void wait(CudaEvent::pointer event) const: 等待一个事件。

inline void wait(CudaEvent const &event) const: 等待一个事件。

私有类型

using StreamPtr = std::unique_ptr<std::remove_pointer_t<cudaStream_t>, Deleter>

私有成员

StreamPtr mStream

int mDevice = {-1}

友元

friend class CudaStreamBindings

class Deleter

公共函数

inline explicit Deleter(bool ownsStream)

inline explicit Deleter()

inline constexpr void operator()(cudaStream_t stream) const

私有成员

bool mOwnsStream

decodingInput.h

namespace tensorrt_llm

namespace runtime

class DecodingInput

#include <decodingInput.h>

表示解码器的输入。

此输入类型被认为是不可变的。它表示解码器最初接收到的任何内容，并且始终可以这样引用。

公共类型

using TensorConstPtr = ITensor::SharedConstPtr 

using TensorPtr = ITensor::SharedPtr 

公共函数

inline DecodingInput(SizeType32 maxLength, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 batchSize, TensorConstPtr logits, TensorPtr endIds, TensorConstPtr batchSlots)

公共成员

SizeType32 step: 我们正在进行的解码步骤的索引。仅在Python运行时使用。

SizeType32 maxLength: 解码的最大令牌数。

SizeType32 maxAttentionWindow: 解码时考虑的最大注意力窗口长度。

SizeType32 sinkTokenLength: 用作注意力汇的令牌数量，如所述：https://arxiv.org/html/2309.17453v3

SizeType32 batchSize: 批次中的样本数量。

SizeType32 maxStopWordsLen: stopWordsLens 张量中的最大值。

SizeType32 maxBadWordsLen: badWordsLens 张量中的最大值。

TensorConstPtr logits: [batchSize, beamWidth, vocabSizePadded]，在GPU上。Logits是词汇表上的概率分布，模型的输出。

TensorConstPtr endIds: [batchSize * beamWidth], 在GPU上

TensorConstPtr batchSlots: [batchSize], 线性批次ID到序列槽的地址映射, int32_t, 位于固定（pinned）内存中

TensorConstPtr finishReasons: [batchSize, beamWidth], 当前迭代的完成状态。如果某些请求为真，则跳过其解码步骤，在GPU上

TensorConstPtr sequenceLimitLength: [batchSize], 在GPU上。批次中每个序列的最大序列长度。

TensorConstPtr embeddingBias: [batchSize, vocabSizePadded]，在GPU上

TensorConstPtr lengths: [batchSize, beamWidth]，在GPU上

std::vector<TensorPtr> badWordsLists

TensorConstPtr badWordsPtrs: [batchSize][2, badWordsLength], 在GPU上

TensorConstPtr badWordsLens: [batchSize]，在GPU上

std::vector<TensorPtr> stopWordsLists

TensorConstPtr stopWordsPtrs: [batchSize][2, stopWordsLength], 位于固定（pinned）内存中

TensorConstPtr stopWordsLens: [batchSize], 位于固定（pinned）内存中

TensorConstPtr noRepeatNgramSize: [batchSize]，在GPU上

TensorPtr cacheIndirection: [batchSize, beamWidth, maxSeqLen] - 用于束搜索（beam search）的k/v缓存索引，位于GPU上

std::optional<MedusaInputs> medusaInputs

std::optional<ExplicitDraftTokensInputs> explicitDraftTokensInputs

std::optional<LookaheadInputs> lookaheadInputs

std::optional<ExternalDraftTokensInputs> externalDraftTokensInputs

std::optional<EagleInputs> eagleInputs

struct EagleInputs

公共函数

inline EagleInputs(TensorConstPtr nextDraftTokens, TensorConstPtr nextDraftLens, TensorConstPtr nextDraftPaths, TensorConstPtr lastDraftTokens, TensorConstPtr lastDraftLens, TensorConstPtr lastDraftPaths, TensorConstPtr acceptedTokens, TensorConstPtr acceptedLens, TensorConstPtr acceptedPathIds, TensorConstPtr chunkedContextNextTokens, TensorConstPtr seqSlots)

公共成员

TensorConstPtr nextDraftTokens: [batchSize, maxDecodingDraftTokens]

TensorConstPtr nextDraftLens: [batchSize]

TensorConstPtr nextDraftPaths: [batchSize, maxDecodingTokens, maxPathLen]

TensorConstPtr lastDraftTokens: [batchSize, maxNumPaths, maxPathLen]

TensorConstPtr lastDraftLens: [batchSize]

TensorConstPtr lastDraftPaths: [batchSize, maxDecodingTokens, maxPathLen]

TensorConstPtr acceptedTokens: [batchSize, maxPathLen]

TensorConstPtr acceptedLens: [batchSize]

TensorConstPtr acceptedPathIds: [batchSize]

TensorConstPtr chunkedContextNextTokens: [batchSize]

TensorConstPtr seqSlots: [batchSize]

class ExplicitDraftTokensInputs

公共成员

TensorConstPtr nextDraftTokens: [batchSize, maxNumPaths, maxPathLen]

TensorConstPtr nextFlatTokens: [batchSize * maxDecodingTokens]

TensorConstPtr nextDraftIndices: [batchSize, maxNumPaths, maxPathLen]

TensorConstPtr nextDraftProbs: [batchSize, maxNumPaths, maxDraftPathLen, vocabSize]

TensorConstPtr lastDraftTokens: [batchSize, maxNumPaths, maxPathLen]

TensorConstPtr lastDraftIndices: [batchSize, maxNumPaths, maxPathLen]

TensorConstPtr masks: [batchSize, maxDecodingTokens, maxDecodingTokens], 布尔值

TensorConstPtr packedPositionIds: [batchSize * maxDecodingTokens]

TensorConstPtr bestPathLengths: [batchSize]

TensorConstPtr bestPathIndices: [batchSize]

TensorConstPtr nextGenerationLengths: [batchSize]

TensorConstPtr lastPositionIdsBase: [batchSize]

TensorConstPtr lastGenerationLengths: [batchSize]

TensorConstPtr maxGenLengthDevice: [1]

TensorConstPtr seqSlots: [batchSize]

class ExternalDraftTokensInputs

公共成员

TensorPtr draftLogits

TensorPtr draftProbs

TensorPtr targetProbs

TensorPtr numDraftTokens

TensorPtr draftTokenIds

TensorPtr useDraftLogits

TensorPtr useDraftLogitsHost

SizeType32 step

float constantThreshold

bool useRandomAcceptanceThreshold

struct LookaheadInputs

公共成员

TensorPtr tokensPerStep

class MedusaInputs

公共成员

TensorConstPtr medusaPaths: [batchSize, maxTokensPerStep, maxMedusaHeads + 1], 在GPU上

TensorConstPtr medusaTreeIds: [batchSize, maxTokensPerStep]，在GPU上

std::vector<std::vector<TensorPtr>> medusaLogits: [batchSize][maxAcceptedDraftTokensPerStep][maxDraftTokens + 1, vocabSizePadded], 在GPU上

TensorPtr medusaCurTokensPerStep: [batchSize]，在GPU上

TensorConstPtr medusaTargetTokensPerStep: [batchSize]，在GPU上

decodingOutput.h

namespace tensorrt_llm

namespace batch_manager

namespace runtime

class DecodingOutput

公共类型

using TensorPtr = ITensor::SharedPtr 

公共函数

inline explicit DecodingOutput(TensorPtr ids, TensorPtr gatheredIds)

公共成员

TensorPtr ids

TensorPtr gatheredIds

TensorPtr newTokensSteps

TensorPtr newTokens

std::vector<TensorPtr> newTokensVec

TensorPtr finishReasons

TensorPtr finishedSum

TensorPtr logProbs

TensorPtr cumLogProbs

TensorPtr parentIds

TensorPtr lengths

TensorPtr cacheIndirection

TensorPtr logProbsTiled

BeamHypotheses beamHypotheses

std::optional<SpeculativeDecodingOutputs> speculativeDecodingOutputs

std::optional<ExplicitDraftTokensBuffers::Inputs> explicitDraftTokensBuffers

std::optional<LookaheadDecodingBuffers> lookaheadOutputs

std::optional<EagleBuffers::Inputs> eagleBuffers

公共静态属性

static float constexpr kNegativeInfinity = -1e20f

class BeamHypotheses

公共函数

void empty(BufferManager &manager)

void reshape(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 maxSequenceLength)

void release()

void init(BufferManager &manager, TokenIdType endId)

BeamHypotheses slice(SizeType32 batchIndex, SizeType32 size) const

公共成员

TensorPtr outputIdsCBA

TensorPtr logProbsCBA

TensorPtr sequenceLengthsCBA

TensorPtr cumLogProbsCBA

TensorPtr normedScoresCBA

TensorPtr numBeamsCBA

TensorPtr minNormedScoresCBA

TensorPtr batchDones

class SpeculativeDecodingOutputs

公共成员

TensorPtr nextDraftTokens

TensorPtr nextDraftTokensLen

TensorPtr prevDraftTokensLen

TensorPtr acceptedTokensLen

TensorPtr acceptedLengthsCumSum

TensorPtr pathsOffsets

eagleBuffers.h

namespace tensorrt_llm

namespace batch_manager

namespace runtime

class EagleBuffers

公共类型

using LlmRequestPtr = std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>

using RequestVector = std::vector<LlmRequestPtr>

using SizeType32 = runtime::SizeType32 

using ITensor = runtime::ITensor 

using BufferPtr = runtime::IBuffer::SharedPtr 

using TensorPtr = runtime::ITensor::SharedPtr 

using TensorMap = runtime::StringPtrMap<runtime::ITensor>

公共函数

EagleBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, runtime::BufferManager const &manager, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig, executor::DecodingConfig const &decodingConfig, runtime::TllmRuntime const &runtime)

void reshape(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ModelConfig const &modelConfig)

void setFromInputs(RequestVector const &contextRequests, RequestVector const &genRequests, runtime::ITensor const &requestTypes, ITensor const &seqSlots, EagleBuffers::Inputs const &decoderBuffers, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig) const

void insertInputTensors(TensorMap &inputBuffers, TensorMap &outputBuffers, runtime::WorldConfig const &worldConfig) const

公共成员

Inputs engineInputs

class tensorrt_llm::runtime::EagleBuffers::EngineOutputs engineOutputs

私有函数

template<typename T> void setFromInputs(RequestVector const &contextRequests, RequestVector const &genRequests, SizeType32 vocabSizePadded, ITensor const &seqSlots, EagleBuffers::Inputs const &draftBuffers, runtime::EagleModule const &eagleModule, runtime::BufferManager const &manager) const

私有成员

std::size_t scanTempStorageBytes = {0}

std::size_t reduceTempStorageBytes = {0}

float mDefaultPosteriorThreshold = {0.09f}

bool mDoGreedySampling = {true}

BufferPtr scanReduceTempStorage

TensorPtr cumSumGenerationLengths

TensorPtr maxGenerationLength

TensorPtr chunkedContextNextTokensHost

TensorPtr greedySamplingHost

TensorPtr posteriorAlphaHost

TensorPtr posteriorThresholdHost

class EngineOutputs

公共成员

TensorPtr nextDraftTokens: [batchSize, maxDecodingDraftTokens]

TensorPtr nextDraftLens: [batchSize]

TensorPtr nextDraftPaths: [batchSize, maxNumPaths, maxPathLen]

TensorPtr acceptedTokens: [batchSize, maxPathLen]

TensorPtr acceptedLens: [batchSize]

TensorPtr acceptedPaths: [batchSize]

TensorPtr chunkedContextNextTokens: [batchSize]

class Inputs

公共函数

void create(SizeType32 maxNumSequences, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig)

公共成员

TensorPtr temperatures: [maxBatchSize] 或 [numSequences]

TensorPtr posteriorAlpha: [maxBatchSize] 或 [numSequences]

TensorPtr posteriorThreshold: [maxBatchSize] 或 [numSequences]

TensorPtr randomDataSample: [maxBatchSize] 或 [numSequences]

TensorPtr randomDataValidation: [maxBatchSize, maxDecodingTokens] 或 [numSequences, maxDecodingTokens]

TensorPtr draftTokens: [maxBatchSize, maxDecodingDraftTokens] 或 [numSequences, maxDecodingDraftTokens]

TensorPtr draftLens: [maxBatchSize] 或 [numSequences]

TensorPtr draftPaths: [maxBatchSize, maxNumPaths, maxPathLen] 或 [numSequences, maxNumPaths, maxPathLen]

TensorPtr specDecodingGenerationLengths: [maxBatchSize] 或 [numGenSequences]

TensorPtr specDecodingGenerationLengthsHost: [maxBatchSize] 或 [numGenSequences]

TensorPtr specDecodingPackedMasks: [maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)] 或 [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]

TensorPtr specDecodingPositionOffsets: [maxBatchSize] 或 [numGenSequences]

TensorPtr eagleNetCtxRequestTypesHost: [maxBatchSize] 或 [numSequences]

TensorPtr eagleNetCtxContextLengthsHost: [maxBatchSize] 或 [numSequences]

TensorPtr eagleNetCtxPastKeyValueLengthsHost: [maxBatchSize] 或 [numSequences]

TensorPtr eagleNetGenRequestTypesHost: [maxBatchSize] 或 [numSequences]

TensorPtr eagleNetGenContextLengthsHost: [maxBatchSize] 或 [numSequences]

TensorPtr eagleNetGenPastKeyValueLengthsHost: [maxBatchSize] 或 [numSequences]

TensorPtr inputGenTokensHost: [maxBatchSize * maxDecodingTokens] 或 [numSequences * maxDecodingTokens]

TensorPtr chunkedContextNextTokens: [maxBatchSize] 或 [numSequences]

TensorPtr useDynamicTreeHost: [1]

explicitDraftTokensBuffers.h

namespace tensorrt_llm

namespace runtime

class ExplicitDraftTokensBuffers

公共类型

using SizeType32 = runtime::SizeType32 

using ITensor = runtime::ITensor 

using BufferPtr = runtime::IBuffer::SharedPtr 

using TensorPtr = runtime::ITensor::SharedPtr 

using TensorMap = runtime::StringPtrMap<runtime::ITensor>

公共函数

ExplicitDraftTokensBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, runtime::BufferManager const &manager, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig, executor::DecodingConfig const &decodingConfig, runtime::TllmRuntime const &runtime)

void reshape(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ModelConfig const &modelConfig)

void setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ITensor const &requestTypes, ITensor const &seqSlots, ExplicitDraftTokensBuffers::Inputs const &decoderBuffers, ITensor const &contextPositionIds, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig) const

void insertInputTensors(TensorMap &inputBuffers, TensorMap &outputBuffers, runtime::WorldConfig const &worldConfig) const

公共成员

tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs engineInputs

class tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs engineOutputs

std::size_t scanTempStorageBytes = {0}

BufferPtr scanTempStorage

TensorPtr cumSumGenerationLengths

私有函数

template<typename T> void setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, SizeType32 vocabSizePadded, ITensor const &seqSlots, ExplicitDraftTokensBuffers::Inputs const &draftBuffers, ITensor const &contextPositionIds, runtime::ExplicitDraftTokensModule const &explicitDraftTokensModule, runtime::CudaStream const &stream) const

class EngineInputs : public tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs

公共成员

TensorPtr requestTypesDevice: [numSequences]，在GPU上

TensorPtr positionOffsets: [numGenSequences]

class EngineOutputs

公共成员

TensorPtr nextGenerationLengths: [batchSize]

TensorPtr nextPositionOffsets: [batchSize]

TensorPtr masks: [batchSize, maxDecodingTokens, maxDecodingTokens], 布尔值

TensorPtr nextDraftTokens: [batchSize, maxNumPaths, maxPathLen]

TensorPtr nextDraftIndices: [batchSize, maxNumPaths, maxPathLen]

TensorPtr nextDraftProbs: [batchSize, maxNumPaths, maxDraftPathLen, vocabSize]

TensorPtr nextFlatTokens: [batchSize * maxDecodingTokens]

TensorPtr bestPathLengths: [batchSize]

TensorPtr bestPathIndices: [batchSize]

TensorPtr maxGenToken: [1]

TensorPtr totalGenToken: [1]

TensorPtr packedPositionIds: [batchSize * maxDecodingTokens]

class Inputs

由tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs子类化

公共函数

void create(SizeType32 maxNumSequences, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig)

公共成员

TensorPtr temperatures: [maxBatchSize]

TensorPtr positionIdsBase: [maxBatchSize]

TensorPtr generationLengths: [maxBatchSize] 或 [numGenSequences]

TensorPtr randomDataSample: [maxBatchSize]

TensorPtr randomDataValidation: [maxBatchSize, maxNumPaths, maxPathDraftLen] 或 [numGenSequences, maxNumPaths, maxPathDraftLen]

TensorPtr draftTokens: [maxBatchSize, maxNumPaths, maxPathLen] 或 [numGenSequences, maxNumPaths, maxPathLen]

TensorPtr draftIndices: [maxBatchSize, maxNumPaths, maxPathLen] 或 [numGenSequences, maxNumPaths, maxPathLen]

TensorPtr draftProbs: [maxBatchSize, maxNumPaths, maxPathDraftLen, vocabSize] 或 [numGenSequences, maxNumPaths, maxPathDraftLen, vocabSize]

TensorPtr packedMasks: [maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)] 或 [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]

TensorPtr positionIds: [maxBatchSize] 或 [numGenSequences]

TensorPtr maxGenLengthHost

TensorPtr generationLengthsHost

generationInput.h

namespace tensorrt_llm

namespace runtime

class GenerationInput : public tensorrt_llm::runtime::GenericGenerationInput<ITensor::SharedPtr, PromptTuningParams>

公共类型

using Base = GenericGenerationInput<ITensor::SharedPtr, PromptTuningParams>

using TensorPtr = Base::TensorPtr

公共函数

inline explicit GenerationInput(SizeType32 const endId, SizeType32 const padId, TensorPtr ids, TensorPtr lengths, bool packed = false)

template<typename TTensor, typename PromptTuningParams> class GenericGenerationInput

#include <generationInput.h>

endId, is the token ID that marks the end of the input sequence (aka EOS or end-of-sequence). It’s 50,256 for the GPT2 model which has a vocabulary of 50,257 tokens, for example,
padId, is the token ID that is used for padding (i.e. fills in the slots that are at an index greater-or-equal to the input length for padded sequences). It can be set to the same value as endId,
ids, is the tensor of input IDs. That tensor must be allocated on the GPU. When the input tensor is padded, the shape of ids is [batchSize, maxInputLength], where batchSize and maxInputLength must respect the maximum sizes in sessionConfig passed to the GptSession constructor. When the input is packed, the shape of ids is [numTokens], where numTokens is the sum of the lengths of the different sequences in the batch,
lengths, is the tensor of input sequence lengths. That tensor must be allocated on the GPU and contain batchSize values,
packed, indicates if the ids tensor is packed or padded. In this release, that flag must match the value passed to the constructor through the instance of the ModelConfig class. In a future release, the session may be made more flexible and automatically pad or pack the input,

embeddingBiasOpt，是GPU上的浮点值张量，包含在采样期间添加到logits的偏差（在从隐藏状态到logits的投影之后，作为模型的最后一步）。此张量必须具有vocabSize个元素（如传递给构造函数的modelConfig参数中所定义），
badWordsList，是GPU上的整数张量，编码了必须从生成序列中禁止的单词列表。其形状为[2, badWordsLength]，如下所述，或者当批次中每个序列有不同的列表时为[batchSize, 2, badWordsLength]，
stopWordsList，是GPU上的整数张量，编码了触发序列生成结束的单词列表。其形状为[2, stopWordsLength]，如下所述，或者当批次中每个序列有不同的列表时为[batchSize, 2, stopWordsLength]，
maxNewTokens，是要生成的最大令牌数。

badWordsList 和 stopWordsList 张量具有相同的形状 [2, length]。让我们考虑一个包含三个单词的示例来描述这些列表的表示。第一个单词包含标记 [5, 7, 3]，第二个单词包含 [9, 2]，第三个单词由标记 [6, 2, 4, 1] 组成。总共有9个标记。这就是长度。张量的形状是 [2, 9]。张量的第一行必须包含9个标记ID，第二行必须存储单词长度的包含前缀和，如下图所示：

   0           3       5              9
   |           |       |              |
   V           V       V              V
[  5,  7,  3,  9,  2,  6,  2,  4,  1]
[  3,  5,  9, -1, -1, -1, -1, -1, -1]

如果所有单词都由单个标记组成，则张量的最内层维度必须增加1（即，对于由单个标记组成的4个单词，长度必须为5而不是4 — 形状为[2, 5]）。

公共类型

using TensorPtr = TTensor 

公共函数

inline explicit GenericGenerationInput(SizeType32 const endId, SizeType32 const padId, TensorPtr ids, TensorPtr lengths, bool packed = false)

公共成员

SizeType32 endId

SizeType32 padId

TensorPtr ids

TensorPtr lengths

bool packed

TensorPtr embeddingBias

TensorPtr badWordsList

TensorPtr stopWordsList

std::optional<SizeType32> maxNewTokens

PromptTuningParams promptTuningParams

generationOutput.h

namespace tensorrt_llm

namespace runtime

class GenerationOutput : public tensorrt_llm::runtime::GenericGenerationOutput<ITensor::SharedPtr>

公共类型

using Base = GenericGenerationOutput<ITensor::SharedPtr>

using TensorPtr = Base::TensorPtr

公共函数

inline explicit GenerationOutput(TensorPtr ids, TensorPtr lengths)

template<typename TTensor> class GenericGenerationOutput

#include <generationOutput.h>

ids，是一个包含输出令牌ID的张量。它的形状是[batchSize, beamWidth, maxSeqLength]，其中maxSeqLength是maxInputLength和maxNewTokens的总和。生成后，它包含每个序列的输入令牌副本，后跟输出令牌。当序列短于maxSeqLength时，填充令牌会添加到序列的末尾。

请注意，在此版本的TensorRT-LLM中，该张量的形状与之前版本不同；之前版本中的形状是[maxSeqLength, batchSize, beamWidth]。

logProbs, is a tensor of floating-point values on the GPU to store the log-prob of the generated tokens. Its shape is [maxNewTokens, batchSize, beamWidth]. Its shape will likely change in a future release to match the shape of the output ids tensor.
contextLogits, is a tensor of values on the GPU (same datatype as the computation type) to store the logits for the context. Its shape is [batchSize, maxSequenceLength, vocabSizePadded]. If use remove_input_padding, its shape is [packedSize, vocabSizePadded]. This buffer will only be filled in if the TensorRT engine was built with the gather_context_logits or gather_all_token_logits parameter enabled.

After inference is complete, you can get the context logits in GenerationOutput.contextLogits, these are variables on the GPU. For specific acquisition methods, please refer to the example of gptSessionBenchmark.cpp.

It is important to point out that enabling the computation may have an impact on performance (the language modeling head (LM head) has to perform a matrix multiplication on all the context tokens instead of a just the last one).
generationLogits, is a tensor of values on the GPU (same datatype as the computation type) to store the logits for the generation. Its shape is [batchSize, beamWidth, maxOutputLen, vocabSizePadded]. This buffer will only be filled in if the TensorRT engine was built with the gather_generation_logits or gather_all_token_logits parameter enabled.

Generation logits can also be obtained through GenerationOutput.generationLogits after inference is completed.
onTokenGenerated, is a callback function invoked in the generation loop to pass newly generated tokens to the caller while the loop continues to execute. An implementation of that callback must accept the output ids tensor, the generation step and a boolean flag that indicates if the generation is complete.

公共类型

using TensorPtr = TTensor 

using Callback = std::function<void(TensorPtr const &ids, SizeType32 step, bool finished)>

公共函数

inline explicit GenericGenerationOutput(TensorPtr ids, TensorPtr lengths)

公共成员

TensorPtr ids

TensorPtr lengths

TensorPtr cumLogProbs

TensorPtr logProbs

TensorPtr contextLogits

TensorPtr generationLogits

Callback onTokenGenerated

gptDecoder.h

namespace tensorrt_llm

namespace layers

namespace runtime

函数

inline runtime::ITensor::SharedConstPtr getDefaultBatchSlots(runtime::SizeType32 batchSize): 辅助函数，用于为未明确提供批次槽给解码器的路径生成批次槽 [0, 1, …, batchSize - 1]。

template<typename T> class GptDecoder : public virtual tensorrt_llm::runtime::IGptDecoder 

公共类型

using CudaStreamPtr = BufferManager::CudaStreamPtr 

using TensorPtr = std::shared_ptr<ITensor>

公共函数

GptDecoder(executor::DecodingMode const &mode, size_t maxBatchSize, size_t maxBeamWidth, size_t vocabSize, size_t vocabSizePadded, size_t maxSequenceLength, CudaStreamPtr const &stream, std::shared_ptr<SpeculativeDecodingModule const> speculativeDecodingModule = nullptr)

virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize, TensorConstPtr const &batchSlots, std::optional<DecodingOutput> const &output = std::nullopt, std::optional<std::vector<decoder_batch::Request> const> const &requests = std::nullopt) override

virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) override

virtual void forwardSync(DecodingOutput &output, DecodingInput const &input) override

inline virtual SamplingConfig const &getSamplingConfig() override

私有成员

std::shared_ptr<BufferManager> mManager

std::shared_ptr<tensorrt_llm::layers::DynamicDecodeLayer<T>> mDynamicDecodeLayer

std::shared_ptr<tensorrt_llm::runtime::DecodingLayerWorkspace> mDecodingLayerWorkspace

SamplingConfig mSamplingConfig

size_t mMaxBatchSize

executor::DecodingMode mDecodingMode

class IGptDecoder

由tensorrt_llm::runtime::GptDecoder< T >子类化

公共类型

using TensorPtr = runtime::ITensor::SharedPtr 

using TensorConstPtr = runtime::ITensor::SharedConstPtr 

公共函数

virtual ~IGptDecoder() = default

virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize, TensorConstPtr const &batchSlots, std::optional<DecodingOutput> const &output = std::nullopt, std::optional<std::vector<decoder_batch::Request> const> const &requests = std::nullopt) = 0

virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) = 0

virtual void forwardSync(DecodingOutput &output, DecodingInput const &input) = 0

virtual SamplingConfig const &getSamplingConfig() = 0

公共静态函数

static inline std::unique_ptr<IGptDecoder> create(executor::DecodingMode const &mode, nvinfer1::DataType dtype, size_t maxBatchSize, size_t maxBeamWidth, size_t vocabSize, size_t vocabSizePadded, size_t maxSequenceLength, BufferManager::CudaStreamPtr const &stream, std::shared_ptr<SpeculativeDecodingModule const> const &speculativeDecodingModule = nullptr)

gptDecoderBatched.h

namespace tensorrt_llm

namespace runtime

class GptDecoderBatched : public tensorrt_llm::runtime::IGptDecoderBatched 

#include <gptDecoderBatched.h>

支持 in-flight batching（飞行中批处理）的GPT解码器类。

公共类型

enum class ForwardType

值：

enumerator kASYNC

enumerator kSYNC

using CudaStreamPtr = std::shared_ptr<CudaStream>

using TensorPtr = ITensor::SharedPtr 

using SharedConstPtr = ITensor::SharedConstPtr 

公共函数

GptDecoderBatched(std::size_t vocabSize, std::size_t vocabSizePadded, CudaStreamPtr stream, SpeculativeDecodingMode const &speculativeDecodingMode, nvinfer1::DataType dtype)

virtual void setup(executor::DecodingMode const &mode, SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, SizeType32 maxTokensPerStep, nvinfer1::DataType dtype, ModelConfig const &modelConfig) override: 在调用forward()之前设置解码器

virtual void setupExplicitDraftTokens(ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers) override: 为ExplicitDraftTokens解码设置缓冲区。

virtual void setupEagle(EagleBuffers::Inputs eagleBuffers) override: 为Eagle解码设置缓冲区。

virtual void setupLookahead(LookaheadDecodingBuffers lookaheadDecodingBuffers) override: 为Lookahead解码设置缓冲区。

virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig, ModelConfig const &modelConfig) override: 使用新一批输入初始化解码器。

virtual void newRequests(std::vector<SizeType32> const &seqSlots, std::vector<decoder_batch::Request> const &requests, std::vector<SamplingConfig> const &samplingConfigs, ModelConfig const &modelConfig) override: 在seqSlots处使用新的requests初始化批量解码器。

virtual DecoderFinishedEventPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) override: 运行所有请求的一步，而不阻塞主机进程，并返回用于同步的令牌。

virtual void forwardSync(decoder_batch::DecoderFinishedEvent const &decoderFinishEvent) override: 等待与令牌关联的forwardAsync调用完成。

virtual void forwardSync(decoder_batch::DecoderFinishedEvent const &decoderFinishEvent, decoder_batch::Output &output, decoder_batch::Input const &input) override: 调用解码器的forwardSync并等待与令牌关联的forwardAsync调用完成。

virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) override: 为所有请求运行一步，而不阻塞主线程。

virtual void forwardSync() override: 等待最后一次调用 forwardAsync 完成。

inline virtual std::vector<bool> getFinished() const override

Returns:: [batchSize], 已完成请求的指示器

inline virtual TensorPtr getFinishReasons() const override

Returns:: [batchSize, beamWidth], FinishedState 值，在 GPU 上

inline virtual TensorPtr getIds(SizeType32 batchIdx) const override

Parameters:: batchIdx – 批次的索引
Returns:: [maxBeamWidth, maxInputLength + maxNewTokens]，包含请求batchIdx的输入令牌ID和生成的令牌ID，没有填充，位于GPU上。在束搜索的情况下，包含未收集的数据。

inline virtual TensorPtr getIds() const override

Returns:: [batchSize, maxBeamWidth, maxInputLength + maxNewTokens]，包含输入令牌ID和生成的令牌ID，没有填充，位于GPU上。在束搜索的情况下，包含未聚集的数据。

inline virtual TensorPtr getGatheredIds(SizeType32 batchIdx) const override

Parameters:: batchIdx – 批次的索引
Returns:: [batchSize, maxBeamWidth, maxInputLength + maxNewTokens]，仅用于beam search。它包含为请求batchIdx收集的未填充的token ids，位于gpu上。

inline virtual TensorPtr getGatheredIds() const override

Returns:: [batchSize, maxBeamWidth, maxInputLength + maxNewTokens]，仅用于beam search。它包含在gpu上收集的无填充的token ids。

virtual CudaEvent finalize(SizeType32 batchSlot, SamplingConfig const &samplingConfig, bool streaming) const override: 为请求 batchSlot 收集最终的波束搜索结果。结果只有在事件返回后才会可用。

virtual void finalize(SamplingConfig const &samplingConfig) const override: 收集所有请求的最终波束搜索结果。

inline virtual TensorPtr getParentIds() const override

Returns:: [batchSize, maxBeamWidth, maxInputLength + maxNewTokens]，包含在GPU上收集的未填充的beam搜索期间的父ID

inline virtual TensorPtr getCumLogProbs() const override

Returns:: [batchSize, maxBeamWidth], 累积对数概率（每束），在GPU上

inline virtual TensorPtr getCumLogProbs(SizeType32 batchIdx) const override

Returns:: [maxBeamWidth], 累积对数概率（每束），在GPU上

inline virtual TensorPtr getLogProbs() const override

Returns:: [batchSize, maxBeamWidth, maxSequenceLength], 对数概率（每束），在GPU上

inline virtual TensorPtr getLogProbs(SizeType32 batchIdx) const override

Returns:: [maxBeamWidth, maxSequenceLength], 对数概率（每束），在GPU上

inline virtual TensorPtr getAllNewTokens() const override

获取在上一次前向传递中生成的maxTokensPerStep令牌。

Returns:: [maxTokensPerStep, batchSize, maxBeamWidth], 上次前向传递生成的tokens，在gpu上

inline virtual TensorPtr getNewTokens(SizeType32 iter = 0) const override

获取在上一次前向传播步骤中生成的令牌。

Parameters:: iter – 在 [0; maxTokensPerStep) 范围内获取令牌的迭代次数
Returns:: [batchSize, beamWidth], 在 iter 中生成的 tokens（每束），在 gpu 上

inline virtual std::vector<SizeType32> getNbSteps() const override

Returns:: [batchSize]，每次请求上执行的生成步骤数量

inline virtual TensorPtr getNbFinished() const override

Returns:: [1], 已完成序列的数量，位于固定的主机内存中

inline virtual TensorPtr getNextDraftTokens() const override

Returns:: [batchSize, maxDraftTokens], 预测下一步的草稿令牌，在GPU上

inline virtual TensorPtr getPrevDraftTokensLengths() const override

Returns:: [batchSize]，上一步预测的草稿令牌长度，在GPU上

inline virtual TensorPtr getNextDraftTokensLengths() const override

Returns:: [batchSize], 预测下一步的草稿令牌长度，在GPU上

inline virtual TensorPtr getAcceptedLengthsCumSum() const override

Returns:: [batchSize + 1]，接受的草稿令牌长度的独占和，在GPU上

inline virtual TensorPtr getAcceptedPackedPaths() const override

Returns:: [batchSize, maxAcceptedDraftTokensPerStep], 接受的路径打包成连续的张量，在GPU上

inline virtual executor::DecodingMode getDecodingMode() const override

私有类型

using GptDecoderPtr = std::unique_ptr<IGptDecoder>

using DecodingInputPtr = std::unique_ptr<DecodingInput>

using DecodingOutputPtr = std::unique_ptr<DecodingOutput>

私有函数

CudaEvent postProcessRequest(SizeType32 batchIdx, SamplingConfig const &samplingConfig, bool streaming) const: 为请求 batchIdx 收集最终的波束搜索结果。

void newRequest(SizeType32 batchSlot, decoder_batch::Request const &request, SamplingConfig const &samplingConfig, ModelConfig const &modelConfig): 在batchSlot处使用新的request初始化解码器。

void allocateSpeculativeDecodingBuffers(nvinfer1::DataType dtype): 为推测性解码分配缓冲区。

void setupSpeculativeDecoding(ModelConfig const &modelConfig): 为推测解码设置缓冲区。

void setupLookahead(ModelConfig const &modelConfig): 为前瞻解码设置缓冲区。

void newRequestSpeculativeDecoding(SizeType32 batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig, ModelConfig const &modelConfig): 为新推测解码请求设置解码器内部张量。

void newRequestDraftTokensExternal(SizeType32 batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig): 为Draft模型Sps模式中的新请求设置解码器内部张量。

void newRequestMedusa(SizeType32 batchIdx, decoder_batch::Request const &request): 为新Medusa请求设置解码器内部张量。

void newRequestLookahead(SizeType32 batchIdx, decoder_batch::Request const &request): 为新Lookahead请求设置解码器内部张量。

void newRequestExplicitDraftTokens(SizeType32 batchIdx, decoder_batch::Request const &request): 为新显式草稿令牌请求设置解码器内部张量。

void newRequestEagle(SizeType32 batchIdx, decoder_batch::Request const &request, ModelConfig const &modelConfig): 为新Eagle请求设置解码器内部张量。

void updateFinished(decoder_batch::DecoderFinishedEvent const &decoderFinishEvent): 更新主机上所有活动请求的完成状态。

void setExplicitDraftTokensInputs(decoder_batch::Input const &input): 设置显式草稿令牌的输入。

void setEagleInputs(decoder_batch::Input const &input): 设置用于eagle解码的输入。

void forwardDispatch(decoder_batch::Output &output, decoder_batch::Input const &input, ForwardType forwardType): 为每个引擎步骤调用解码器以处理令牌。

void forwardDecoder(SizeType32 step, decoder_batch::Output &output, decoder_batch::Input const &input, ForwardType forwardType): 为整个批次调用解码器。

私有成员

std::size_t const mVocabSize

std::size_t const mVocabSizePadded

CudaStreamPtr mRuntimeStream

CudaStreamPtr mDecoderStream

BufferManager mBufferManager

DecoderFinishedEventPtr mDecoderFinishEvent

CudaEvent mForwardEvent

GptDecoderPtr mDecoder

DecodingInputPtr mJointDecodingInput

DecodingOutputPtr mJointDecodingOutput

std::vector<SizeType32> mNbSteps

std::vector<bool> mFinished

TensorPtr mFinishedSum

std::vector<SizeType32> mMaxNewTokens

std::vector<SizeType32> mBeamWidths

std::vector<SizeType32> mNumDecodingEngineTokens

TensorPtr mFinishedSteps

TensorPtr mBatchSlotsSetup

TensorPtr mBatchSlotsDecoder

SizeType32 mMaxSequenceLength = {}

SizeType32 mMaxAttentionWindow = {}

SizeType32 mSinkTokenLength = {}

SizeType32 mActualBatchSize = {}

SizeType32 mMaxDecodingDecoderTokens = {}

SizeType32 mMaxDecodingEngineTokens = {}

SpeculativeDecodingMode mSpeculativeDecodingMode

executor::DecodingMode mDecodingMode = {executor::DecodingMode::Auto()}

std::shared_ptr<DecodingOutput::BeamHypotheses> mOutputBeamHypotheses = {nullptr}

DecodingOutput::TensorPtr mCumLogProbsTmp

SizeType32 mNumSMs

gptJsonConfig.h

namespace tensorrt_llm

namespace runtime

class GptJsonConfig

公共函数

inline GptJsonConfig(std::string name, std::string version, std::string precision, SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 contextParallelism, SizeType32 gpusPerNode, ModelConfig modelConfig, std::optional<RuntimeDefaults> runtimeDefaults = std::nullopt)

inline ModelConfig const &getModelConfig() const

inline ModelConfig &getModelConfigMutable()

inline std::string const &getName() const

inline std::string const &getVersion() const

inline std::string const &getPrecision() const

inline SizeType32 constexpr getTensorParallelism() const

inline SizeType32 constexpr getPipelineParallelism() const

inline SizeType32 constexpr getContextParallelism() const

inline SizeType32 constexpr getGpusPerNode() const

inline SizeType32 constexpr getWorldSize() const

inline std::optional<RuntimeDefaults> getRuntimeDefaults() const

std::string engineFilename(WorldConfig const &worldConfig, std::string const &model) const

inline std::string engineFilename(WorldConfig const &worldConfig) const

公共静态函数

static GptJsonConfig parse(std::string const &json)

static GptJsonConfig parse(std::istream &json)

static GptJsonConfig parse(std::filesystem::path const &path)

私有成员

std::string const mName

std::string const mVersion

std::string const mPrecision

SizeType32 const mTensorParallelism

SizeType32 const mPipelineParallelism

SizeType32 const mContextParallelism

SizeType32 const mGpusPerNode

ModelConfig mModelConfig

std::optional<RuntimeDefaults> mRuntimeDefaults

gptSession.h

namespace tensorrt_llm

namespace batch_manager

namespace kv_cache_manager

namespace runtime

class GptSession

公共类型

using LoggerPtr = std::shared_ptr<nvinfer1::ILogger>

公共函数

GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, RawEngine const &rawEngine, LoggerPtr logger = nullptr)

Parameters:

sessionConfig – 会话的配置，
modelConfig – 模型的描述，
worldConfig – 环境的描述，
rawEngine – 编译后的TensorRT引擎，
logger – 可选的日志记录器。

inline GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, void const *engineBuffer, std::size_t engineSize, LoggerPtr logger = nullptr)

inline GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, std::vector<uint8_t> const &engineBuffer, LoggerPtr logger = nullptr)

GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, std::string const &engineFile, LoggerPtr logger = nullptr)

nvinfer1::ILogger &getLogger() const

BufferManager const &getBufferManager() const

BufferManager::CudaStreamPtr getRuntimeStreamPtr() const

inline ModelConfig const &getModelConfig() const

inline WorldConfig const &getWorldConfig() const

inline int getDevice() const noexcept

inline bool getNormalizeLogProbs() const noexcept

nvinfer1::IEngineInspector &getEngineInspector() const

nvinfer1::DataType getLogitDataType() const

void generate(GenerationOutput &outputs, GenerationInput const &inputs, SamplingConfig const &samplingConfig, std::shared_ptr<GenerationProfiler> const generationProfiler = nullptr)

此函数执行生成循环。

给定要读取的输入张量和要填充的输出张量，该成员函数将运行生成循环，直到达到可生成的最大标记数，或者每个序列都已完成（因为生成了“序列结束”标记或“停止词”列表中的单词）。该函数的伪代码如下（为简化演示，成员函数名称已更改）：

// Have all the sequences in the batch reached completion?
bool allFinished = false;

// Until all sequences are finished or the number of steps reaches the limit...
for (int step = 0; !allFinished && step < maxNewTokens; ++step) {

// Trigger the computation of the logits...
computeLogits(...);

// Run the sampling to produce a token (for each active sequence) from the logits.
allFinished = generateTokensFromLogits(...);

// Callback to stream the output tokens while the generation loop continues.
onTokenGenerated(...);
}

void setLayerProfiler(): 设置LayerProfiler以收集每层的性能。

std::string getLayerProfileInfo() const: 打印每层的性能分析信息。

私有类型

using BaseKVCacheManager = batch_manager::kv_cache_manager::BaseKVCacheManager

using KvCacheConfig = batch_manager::kv_cache_manager::KvCacheConfig

using TensorPtr = runtime::ITensor::SharedPtr 

using TokenGeneratedCallback = std::function<void(SizeType32 step, bool finished)>

私有函数

inline bool useCudaGraphs()

void generateBatched(std::vector<GenerationOutput> &microBatchesOutputs, std::vector<GenerationInput> const &microBatchesInputs, SamplingConfig const &samplingConfig, TokenGeneratedCallback const &onTokenGenerated, std::shared_ptr<GenerationProfiler> const generationProfiler)

void setup(Config const &sessionConfig)

void createContexts()

void createBuffers(SizeType32 numMicroBatches)

void createDecoders(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, nvinfer1::DataType logitsType, bool decoderPerRequest, SizeType32 numMicroBatches, executor::DecodingMode const &decodingMode)

void createKvCacheManager(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, KvCacheConfig const &config)

void createCustomAllReduceWorkspace(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 maxSequenceLength)

void executeContextStep(std::vector<GenerationInput> const &generationBatchesInputs, std::vector<SizeType32> const &generationBatchesOffsets, BaseKVCacheManager const *kvCacheManager)

SizeType32 executeGenerationStep(SizeType32 step, std::vector<GenerationInput> const &microBatchesInputs, std::vector<GenerationOutput> &microBatchesOutputs, std::vector<SizeType32> const &microBatchOffsets, BaseKVCacheManager *kvCacheManager, std::vector<bool> &microBatchesFinished)

void decoderStepAsync(SizeType32 decoderStep, SizeType32 microBatchId): 在最后一个流水线并行（PP）rank 上执行解码器，在其他 PP rank 上接收解码器输出。

bool shouldStopSync(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 microBatchId): 与解码器同步并返回shouldStop标志。

void finalize(SizeType32 microBatchId, SamplingConfig const &samplingConfig)

在最后一个 PP rank 上收集最终输出 ID 和对数概率，并将它们发送到第一个 PP rank。

在主机上接收是异步的，因此在访问之前需要同步。

void kvCacheAddSequences(SizeType32 beamWidth, SizeType32 microBatchId, SizeType32 firstBatchIdx)

ITensor::SharedPtr initDecoder(ITensor &outputIds, GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig, SizeType32 microBatchId) const: 填充outputIds并返回对新Tokens张量的引用。

TokenGeneratedCallback createOnTokenGeneratedCallback(GenerationOutput &outputs)

bool shouldUseKVCacheManager() const

私有成员

ModelConfig const mModelConfig

WorldConfig const mWorldConfig

int mDevice = {-1}

std::shared_ptr<NcclCommunicator> mPipelineComm

std::shared_ptr<CudaStream> mCommStream

CudaEvent mCommEvent = {}

std::shared_ptr<AllReduceBuffers> mAllReduceBuffers

SizeType32 mDecoderMaxSequenceLength = {}

std::vector<SizeType32> mDecoderMaxAttentionWindowVec = {}

SizeType32 mDecoderMaxAttentionWindow = {}

SizeType32 mDecoderSinkTokenLength = {}

LoggerPtr mLogger

std::shared_ptr<TllmRuntime> mRuntime

std::shared_ptr<BaseKVCacheManager> mKvCacheManager

MicroBatchConfig mMicroBatchConfig

std::vector<std::shared_ptr<IStatefulGptDecoder>> mDecoders

std::vector<std::shared_ptr<RuntimeBuffers>> mBuffers

std::vector<CudaEvent> mReceivedEvents

bool mCudaGraphMode = {false}

std::vector<CudaGraphExecutor> mCudaGraphInstances

bool mNormalizeLogProbs = true

友元

friend class batch_manager::TrtGptModelV1

class Config

#include <gptSession.h>

会话执行和缓冲区大小的配置。generate 可以使用比配置参数更小的批处理大小和波束宽度来调用。

maxBatchSize 将被微批次数除，以初始化每个批处理缓冲区。

公共函数

inline Config(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxSequenceLength, float gpuWeightsPercent = 1.0)

公共成员

SizeType32 maxBatchSize

SizeType32 maxBeamWidth

SizeType32 maxSequenceLength

float gpuWeightsPercent

bool decoderPerRequest = {false}

bool cudaGraphMode = {false}

KvCacheConfig kvCacheConfig = {}

std::optional<SizeType32> ctxMicroBatchSize = std::nullopt

std::optional<SizeType32> genMicroBatchSize = std::nullopt

std::optional<executor::DecodingMode> decodingMode = std::nullopt

bool normalizeLogProbs = true

class CudaGraphExecutor

公共函数

CudaGraphExecutor() = default

inline ~CudaGraphExecutor()

inline bool hasInstance()

void clear()

void prepareNextGraph(TllmRuntime const &runtime, SizeType32 nextContextId)

void launch(CudaStream const &stream)

私有函数

void create(cudaGraph_t const &graph)

bool update(cudaGraph_t const &graph)

void uploadToStream(CudaStream const &stream)

私有成员

cudaGraphExec_t mInstance

class GenerationProfiler

#include <gptSession.h>

可选的性能分析类，用于分析推理请求生成阶段的性能。

公共函数

inline GenerationProfiler()

inline CudaEvent const &getStart() const

inline CudaEvent const &getEnd() const

inline float getElapsedTimeMs()

公共静态属性

static constexpr unsigned int flags = {cudaEventDefault}

私有成员

CudaEvent start

CudaEvent end

class MicroBatchConfig

公共函数

inline MicroBatchConfig()

explicit MicroBatchConfig(SizeType32 maxBatchSize, SizeType32 pipelineParallelism, std::optional<SizeType32> genMicroBatchSize, std::optional<SizeType32> ctxMicroBatchSize)

inline constexpr SizeType32 numCtxPerGen() const

inline constexpr SizeType32 getGenGraphId(SizeType32 flipFlopId, SizeType32 generationBatchId) const: 针对每个生成批次，在两个图实例之间来回切换。

公共成员

SizeType32 numCtxBatches

SizeType32 numGenBatches

SizeType32 ctxBatchSize

SizeType32 genBatchSize

namespace utils

函数

std::vector<uint8_t> loadEngine(std::string const &enginePath)

iBuffer.h

template<> struct MemoryTypeString<MemoryType::kGPU>

公共静态属性

static auto constexpr value = "GPU"

template<> struct MemoryTypeString<MemoryType::kCPU>

公共静态属性

static auto constexpr value = "CPU"

template<> struct MemoryTypeString<MemoryType::kPINNED>

公共静态属性

static auto constexpr value = "PINNED"

template<> struct MemoryTypeString<MemoryType::kUVM>

公共静态属性

static auto constexpr value = "UVM"

template<> struct MemoryTypeString<MemoryType::kPINNEDPOOL>

公共静态属性

static auto constexpr value = "PINNEDPOOL"

template<> struct DataTypeTraits<nvinfer1::DataType::kFLOAT>

公共类型

using type = float

公共静态属性

static char constexpr name[] = "float"

static auto constexpr size = sizeof(type)

template<> struct DataTypeTraits<nvinfer1::DataType::kHALF>

公共类型

using type = half

公共静态属性

static char constexpr name[] = "half"

static auto constexpr size = sizeof(type)

template<> struct DataTypeTraits<nvinfer1::DataType::kINT8>

公共类型

using type = std::int8_t

公共静态属性

static char constexpr name[] = "int8"

static auto constexpr size = sizeof(type)

template<> struct DataTypeTraits<nvinfer1::DataType::kINT32>

公共类型

using type = std::int32_t

公共静态属性

static char constexpr name[] = "int32"

static auto constexpr size = sizeof(type)

template<> struct DataTypeTraits<nvinfer1::DataType::kINT64>

公共类型

using type = std::int64_t

公共静态属性

static char constexpr name[] = "int64"

static auto constexpr size = sizeof(type)

template<> struct DataTypeTraits<nvinfer1::DataType::kINT32, true>

公共类型

using type = std::uint32_t

公共静态属性

static char constexpr name[] = "uint32"

static auto constexpr size = sizeof(type)

template<> struct DataTypeTraits<nvinfer1::DataType::kINT64, true>

公共类型

using type = std::uint64_t

公共静态属性

static char constexpr name[] = "uint64"

static auto constexpr size = sizeof(type)

template<bool kUnsigned> struct DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned>

公共类型

using type = bool

公共静态属性

static char constexpr name[] = "bool"

static auto constexpr size = sizeof(type)

template<bool kUnsigned> struct DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned>

公共类型

using type = std::uint8_t

公共静态属性

static char constexpr name[] = "uint8"

static auto constexpr size = sizeof(type)

template<> struct TRTDataType<std::int8_t>

公共静态属性

static constexpr auto value = nvinfer1::DataType::kINT8

template<> struct TRTDataType<std::int32_t>

公共静态属性

static constexpr auto value = nvinfer1::DataType::kINT32

template<> struct TRTDataType<std::uint32_t>

公共静态属性

static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}

template<> struct TRTDataType<std::int64_t>

公共静态属性

static constexpr auto value = nvinfer1::DataType::kINT64

template<> struct TRTDataType<std::uint64_t>

公共静态属性

static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}

template<> struct TRTDataType<std::uint8_t>

公共静态属性

static constexpr auto value = nvinfer1::DataType::kUINT8

template<> struct TRTDataType<kernels::KVCacheIndex>

公共静态属性

static constexpr auto value = TRTDataType<kernels::KVCacheIndex::UnderlyingType>::value

template<> struct TRTDataType<kernels::FinishedState>

公共静态属性

static constexpr auto value = TRTDataType<kernels::FinishedState::UnderlyingType>::value

template<> struct TRTDataType<runtime::RequestType>

公共静态属性

static constexpr auto value = TRTDataType<std::underlying_type_t<runtime::RequestType>>::value

namespace tensorrt_llm

namespace runtime

类型定义

template<typename T> using PointerElementType = typename std::remove_reference_t<T>::element_type

枚举

enum class MemoryType : std::int32_t

值：

enumerator kGPU

enumerator kCPU

enumerator kPINNED

enumerator kUVM

enumerator kPINNEDPOOL

函数

template<typename T> std::shared_ptr<std::remove_const_t<T>> constPointerCast(std::shared_ptr<T> const &ptr) noexcept

template<typename T, typename D> std::shared_ptr<std::remove_const_t<T>> constPointerCast(std::unique_ptr<T, D> &&ptr) noexcept

template<typename T> T const *bufferCast(IBuffer const &buffer)

获取指向缓冲区常量基础数据的类型化指针。

Template Parameters:: T – 基础数据的类型。
Parameters:: buffer – 要获取指针的缓冲区。
Returns:: 指向常量 T 的指针。

template<typename T> T *bufferCast(IBuffer &buffer)

获取指向缓冲区基础数据的类型化指针。

Template Parameters:: T – 基础数据的类型。
Parameters:: buffer – 要获取指针的缓冲区。
Returns:: 指向 T 的指针。

template<typename T> T *bufferCastOrNull(IBuffer::SharedPtr const &bufferPtr)

检索指向由bufferPtr指向的缓冲区底层数据的T类型指针，如果bufferPtr为空，则返回nullptr。

Template Parameters:: T – 基础数据的类型。
Parameters:: bufferPtr – 一个可能为空的共享指针。
Returns:: 指向T的指针，可能为nullptr。

template<typename T> T const *bufferCastOrNull(IBuffer::SharedConstPtr const &bufferPtr)

检索指向由bufferPtr指向的缓冲区基础数据的T const类型指针，如果bufferPtr为空，则返回nullptr。

Template Parameters:: T – 基础数据的类型。
Parameters:: bufferPtr – 一个可能为空的共享指针。
Returns:: 指向常量T的指针，可能为nullptr。

template<typename T> T *bufferCastOrNull(std::optional<IBuffer::SharedPtr> const &optionalBufferPtr)

检索一个指向optionalBufferPtr中包含的缓冲区指针所指向的底层数据的T类型指针，如果optional没有值，则返回nullptr。

Template Parameters:: T – 基础数据的类型。
Parameters:: optionalBufferPtr – 一个可能为空的选项。
Returns:: 指向T的指针，可能为nullptr。

template<typename T> T const *bufferCastOrNull(std::optional<IBuffer::SharedConstPtr> const &optionalBufferPtr)

检索一个指向包含在optionalBufferPtr中的缓冲区指针所指向的底层数据的T const类型指针，如果optional没有值，则返回nullptr。

Template Parameters:: T – 基础数据的类型。
Parameters:: optionalBufferPtr – 一个可能为空的选项。
Returns:: 指向常量T的指针，可能为nullptr。

std::ostream &operator<<(std::ostream &output, IBuffer const &buffer): 用于打印缓冲区的实用函数。

class BufferDataType

#include <iBuffer.h>

一个围绕nvinfer1::DataType的包装器，提供了对指针类型的支持。

公共函数

inline constexpr BufferDataType(nvinfer1::DataType dataType, bool _unsigned = false, bool pointer = false)

inline constexpr operator nvinfer1::DataType() const noexcept

inline constexpr nvinfer1::DataType getDataType() const noexcept

inline constexpr bool isPointer() const noexcept

inline constexpr bool isUnsigned() const

inline constexpr std::size_t getSize() const noexcept

公共静态属性

static auto constexpr kTrtPointerType = nvinfer1::DataType::kINT64

私有成员

nvinfer1::DataType mDataType

bool mUnsigned

bool mPointer

template<typename T> class BufferRange : public tensorrt_llm::common::ArrayView<T>

公共类型

using Base = tensorrt_llm::common::ArrayView<T>

公共函数

inline BufferRange(T *data, size_type size)

template<typename U = T, std::enable_if_t<!std::is_const_v<U>, bool> = true> inline explicit BufferRange(IBuffer &buffer)

template<typename U = T, std::enable_if_t<std::is_const_v<U>, bool> = true> inline explicit BufferRange(IBuffer const &buffer)

template<nvinfer1::DataType kDataType, bool kIsUnsigned = false, bool kIsPointer = false> struct DataTypeTraits: #include <iBuffer.h>

用于将TensorRT数据类型转换为C++数据类型。

template<nvinfer1::DataType kDataType, bool kUnsigned> struct DataTypeTraits<kDataType, kUnsigned, true>

公共类型

using type = typename DataTypeTraits<kDataType, kUnsigned, false>::type*

公共静态属性

static char constexpr name[] = "*"

static auto constexpr size = sizeof(type)

template<bool kUnsigned> struct DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned>

公共类型

using type = bool

公共静态属性

static char constexpr name[] = "bool"

static auto constexpr size = sizeof(type)

template<> struct DataTypeTraits<nvinfer1::DataType::kFLOAT>

公共类型

using type = float

公共静态属性

static char constexpr name[] = "float"

static auto constexpr size = sizeof(type)

template<> struct DataTypeTraits<nvinfer1::DataType::kHALF>

公共类型

using type = half

公共静态属性

static char constexpr name[] = "half"

static auto constexpr size = sizeof(type)

template<> struct DataTypeTraits<nvinfer1::DataType::kINT32>

公共类型

using type = std::int32_t

公共静态属性

static char constexpr name[] = "int32"

static auto constexpr size = sizeof(type)

template<> struct DataTypeTraits<nvinfer1::DataType::kINT32, true>

公共类型

using type = std::uint32_t

公共静态属性

static char constexpr name[] = "uint32"

static auto constexpr size = sizeof(type)

template<> struct DataTypeTraits<nvinfer1::DataType::kINT64>

公共类型

using type = std::int64_t

公共静态属性

static char constexpr name[] = "int64"

static auto constexpr size = sizeof(type)

template<> struct DataTypeTraits<nvinfer1::DataType::kINT64, true>

公共类型

using type = std::uint64_t

公共静态属性

static char constexpr name[] = "uint64"

static auto constexpr size = sizeof(type)

template<> struct DataTypeTraits<nvinfer1::DataType::kINT8>

公共类型

using type = std::int8_t

公共静态属性

static char constexpr name[] = "int8"

static auto constexpr size = sizeof(type)

template<bool kUnsigned> struct DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned>

公共类型

using type = std::uint8_t

公共静态属性

static char constexpr name[] = "uint8"

static auto constexpr size = sizeof(type)

class IBuffer

由 tensorrt_llm::runtime::ITensor 子类化

公共类型

using UniquePtr = std::unique_ptr<IBuffer>

using SharedPtr = std::shared_ptr<IBuffer>

using UniqueConstPtr = std::unique_ptr<IBuffer const>

using SharedConstPtr = std::shared_ptr<IBuffer const>

using DataType = nvinfer1::DataType

公共函数

virtual void *data() = 0: 返回指向底层数组的指针。

virtual void const *data() const = 0: 返回指向底层数组的指针。

inline virtual void *data(std::size_t index): 返回指向给定元素索引处底层数组的指针。

inline virtual void const *data(std::size_t index) const: 返回指向给定元素索引处底层数组的指针。

virtual std::size_t getSize() const = 0: 返回缓冲区的大小（以元素数量计）。

inline virtual std::size_t getSizeInBytes() const: 返回缓冲区的大小（以字节为单位）。

virtual std::size_t getCapacity() const = 0: 返回缓冲区的容量。

virtual DataType getDataType() const = 0: 返回缓冲区的数据类型。

virtual char const *getDataTypeName() const

virtual MemoryType getMemoryType() const = 0: 返回缓冲区的内存类型。

virtual char const *getMemoryTypeName() const

virtual void resize(std::size_t newSize) = 0: 调整缓冲区的大小。如果新大小小于或等于当前容量，则此操作为空操作（no-op）。

virtual void release() = 0: 释放缓冲区。它将被重置为 nullptr。

virtual ~IBuffer() = default

IBuffer(IBuffer const&) = delete: 不允许复制。

IBuffer &operator=(IBuffer const&) = delete: 不允许复制。

公共静态函数

static UniquePtr slice(SharedPtr buffer, std::size_t offset, std::size_t size)

在底层的buffer上创建一个切片视图。该视图将具有与buffer相同的数据类型。

Parameters:

buffer – 要查看的缓冲区。
offset – 视图的偏移量。
size – 视图的大小。

Returns:

关于buffer的视图。

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0> static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)

static inline UniquePtr slice(SharedPtr buffer, std::size_t offset)

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0> static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)

static inline UniquePtr view(SharedPtr tensor)

返回基础tensor的视图，该视图可以独立调整大小。

Parameters:: tensor – 要查看的张量。
Returns:: 关于tensor的视图。

static inline UniquePtr view(SharedPtr tensor, std::size_t size)

返回基础tensor的不同大小的视图。

Parameters:

tensor – 要查看的张量。
size – 视图的大小。

Returns:

关于tensor的视图。

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0> static inline UniqueConstPtr view(TConstPtr &&tensor, std::size_t size)

static UniquePtr wrap(void *data, DataType type, std::size_t size, std::size_t capacity)

将给定的data包装在IBuffer中。IBuffer不会拥有底层的data，并且不能调整大小超过capacity。

Parameters:

data – 要包装的数据。
type – data 的数据类型。
size – 缓冲区的大小。
capacity – 缓冲区的容量。

Returns:

一个 IBuffer。

static inline UniquePtr wrap(void *data, DataType type, std::size_t size)

template<typename T> static inline UniquePtr wrap(T *data, std::size_t size, std::size_t capacity)

template<typename T> static inline UniquePtr wrap(T *data, std::size_t size)

template<typename T> static inline UniquePtr wrap(std::vector<T> &v)

static MemoryType memoryType(void const *data): 确定指针的内存类型。

受保护的函数

IBuffer() = default

inline std::size_t toBytes(std::size_t size) const: 返回一个数组索引或字节大小。

template<MemoryType T> struct MemoryTypeString

template<> struct MemoryTypeString<MemoryType::kCPU>

公共静态属性

static auto constexpr value = "CPU"

template<> struct MemoryTypeString<MemoryType::kGPU>

公共静态属性

static auto constexpr value = "GPU"

template<> struct MemoryTypeString<MemoryType::kPINNED>

公共静态属性

static auto constexpr value = "PINNED"

template<> struct MemoryTypeString<MemoryType::kPINNEDPOOL>

公共静态属性

static auto constexpr value = "PINNEDPOOL"

template<> struct MemoryTypeString<MemoryType::kUVM>

公共静态属性

static auto constexpr value = "UVM"

template<typename T, bool = false> struct TRTDataType: #include <iBuffer.h>

用于将C++数据类型转换为TensorRT数据类型。

template<> struct TRTDataType<bool>

公共静态属性

static constexpr auto value = nvinfer1::DataType::kBOOL

template<> struct TRTDataType<float>

公共静态属性

static constexpr auto value = nvinfer1::DataType::kFLOAT

template<> struct TRTDataType<half>

公共静态属性

static constexpr auto value = nvinfer1::DataType::kHALF

template<> struct TRTDataType<kernels::FinishedState>

公共静态属性

static constexpr auto value = TRTDataType<kernels::FinishedState::UnderlyingType>::value

template<> struct TRTDataType<kernels::KVCacheIndex>

公共静态属性

static constexpr auto value = TRTDataType<kernels::KVCacheIndex::UnderlyingType>::value

template<> struct TRTDataType<runtime::RequestType>

公共静态属性

static constexpr auto value = TRTDataType<std::underlying_type_t<runtime::RequestType>>::value

template<> struct TRTDataType<std::int32_t>

公共静态属性

static constexpr auto value = nvinfer1::DataType::kINT32

template<> struct TRTDataType<std::int64_t>

公共静态属性

static constexpr auto value = nvinfer1::DataType::kINT64

template<> struct TRTDataType<std::int8_t>

公共静态属性

static constexpr auto value = nvinfer1::DataType::kINT8

template<> struct TRTDataType<std::uint32_t>

公共静态属性

static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}

template<> struct TRTDataType<std::uint64_t>

公共静态属性

static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}

template<> struct TRTDataType<std::uint8_t>

公共静态属性

static constexpr auto value = nvinfer1::DataType::kUINT8

template<typename T> struct TRTDataType<T*>

公共静态属性

static auto constexpr value = BufferDataType{kUnderlyingType.getDataType(), kUnderlyingType.isUnsigned(), true}

私有静态属性

static auto constexpr kUnderlyingType = BufferDataType{TRTDataType<std::remove_const_t<T>, false>::value}

template<> struct TRTDataType<void*>

公共静态属性

static constexpr auto value = BufferDataType::kTrtPointerType 

iGptDecoderBatched.h

namespace tensorrt_llm

namespace runtime

class IGptDecoderBatched : public virtual tensorrt_llm::runtime::IStatefulGptDecoder 

#include <iGptDecoderBatched.h>

支持飞行中批处理的GPT解码器类。

由tensorrt_llm::runtime::GptDecoderBatched子类化

公共类型

using CudaStreamPtr = std::shared_ptr<CudaStream>

using TensorPtr = std::shared_ptr<ITensor>

using DecoderFinishedEventPtr = std::unique_ptr<decoder_batch::DecoderFinishedEvent const>

公共函数

virtual void setupExplicitDraftTokens(ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers) = 0: 为ExplicitDraftTokens解码设置缓冲区。

virtual void setupEagle(EagleBuffers::Inputs eagleBuffers) = 0: 为Eagle解码设置缓冲区。

virtual void setupLookahead(LookaheadDecodingBuffers lookaheadDecodingBuffers) = 0: 为Lookahead解码设置缓冲区。

virtual DecoderFinishedEventPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) = 0: 运行所有请求的一步，而不阻塞主机进程，并返回用于同步的令牌。

virtual void forwardSync(decoder_batch::DecoderFinishedEvent const &token, decoder_batch::Output &output, decoder_batch::Input const &input) = 0: 调用解码器的forwardSync并等待与令牌关联的forwardAsync调用完成。

virtual void forwardSync(decoder_batch::DecoderFinishedEvent const &token) = 0: 等待与令牌关联的forwardAsync调用完成。

inline virtual void forward(decoder_batch::Output &output, decoder_batch::Input const &input): 为所有请求运行一步，并在主机上等待完成。

virtual TensorPtr getIds(SizeType32 batchIdx) const = 0

Parameters:: batchIdx – 批次的索引
Returns:: [maxBeamWidth, maxInputLength + maxNewTokens]，包含请求batchIdx的输入令牌ID和生成的令牌ID，没有填充，位于GPU上。

virtual TensorPtr getGatheredIds(SizeType32 batchIdx) const = 0

Returns:: [batchSize, maxBeamWidth, maxInputLength + maxNewTokens]，仅在GptDecoderBatched中的束搜索中使用。它包含在GPU上收集的无填充的令牌ID。

virtual CudaEvent finalize(SizeType32 batchIdx, SamplingConfig const &samplingConfig, bool streaming) const = 0: 为请求 batchIdx 收集最终的波束搜索结果。结果只有在事件返回后才会可用。

virtual std::vector<bool> getFinished() const = 0

Returns:: [batchSize (实际)], 标记完成的请求（每批）

virtual TensorPtr getFinishReasons() const = 0

Returns:: [batchSize, beamWidth], FinishedState 值，在 GPU 上

virtual TensorPtr getCumLogProbs() const = 0

Returns:: [batchSize, beamWidth], 累积对数概率（每束），在GPU上

virtual TensorPtr getCumLogProbs(SizeType32 batchIdx) const = 0

Returns:: [beamWidth], 请求批次 batchIdx 的累积对数概率（每束），在 GPU 上

virtual TensorPtr getLogProbs() const = 0

Returns:: [batchSize, beamWidth, maxSeqLen], 对数概率（每束），在GPU上

virtual TensorPtr getLogProbs(SizeType32 batchIdx) const = 0

Returns:: [beamWidth, maxSeqLen], 请求批次 batchIdx 的累积对数概率（每束），在 GPU 上

virtual TensorPtr getParentIds() const = 0

virtual std::vector<SizeType32> getNbSteps() const = 0

virtual executor::DecodingMode getDecodingMode() const = 0

virtual void newRequests(std::vector<SizeType32> const &seqSlots, std::vector<decoder_batch::Request> const &requests, std::vector<SamplingConfig> const &samplingConfigs, ModelConfig const &modelConfig) = 0: 在seqSlots处使用新的requests初始化批量解码器。

virtual TensorPtr getNextDraftTokens() const = 0

Returns:: [batchSize, maxTokensPerStep-1], 预测下一步的草稿令牌，在GPU上

virtual TensorPtr getPrevDraftTokensLengths() const = 0

Returns:: [batchSize]，上一步预测的草稿令牌长度，在GPU上

virtual TensorPtr getNextDraftTokensLengths() const = 0

Returns:: [batchSize], 预测下一步的草稿令牌长度，在GPU上

virtual TensorPtr getAcceptedLengthsCumSum() const = 0

Returns:: [batchSize + 1]，接受的草稿令牌长度的独占和，在GPU上

virtual TensorPtr getAcceptedPackedPaths() const = 0

Returns:: [batchSize, maxAcceptedDraftTokensPerStep], 接受的路径打包成连续的张量，在GPU上

受保护的函数

IGptDecoderBatched() = default

namespace decoder_batch

类型定义

using Output = decoder::Output

class DecoderFinishedEvent

公共函数

inline explicit DecoderFinishedEvent(CudaEvent &&event, std::vector<bool> const &active)

公共成员

CudaEvent event

std::vector<bool> active

class Input

公共类型

using TensorConstPtr = ITensor::SharedConstPtr 

using TensorPtr = ITensor::SharedPtr 

公共函数

inline explicit Input(std::vector<TensorPtr> const &logits, std::vector<bool> const &active)

inline explicit Input(std::vector<TensorPtr> const &logits)

公共成员

std::vector<TensorPtr> logits

std::vector<bool> active

TensorPtr cacheIndirection

std::vector<std::vector<TensorPtr>> predictedDraftLogits

TensorPtr seqSlots

std::optional<ExplicitDraftTokensBuffers::EngineOutputs> explicitDraftTokensInputs

std::optional<ExplicitDraftTokensBuffers::EngineInputs> explicitDraftTokensLastInputs

std::optional<EagleBuffers::EngineOutputs> eagleInputs

std::optional<EagleBuffers::Inputs> eagleLastInputs

iStatefulGptDecoder.h

namespace tensorrt_llm

namespace batch_manager

namespace runtime

class IStatefulGptDecoder

#include <iStatefulGptDecoder.h>

支持动态批处理（in-flight batching）的GPT解码器类。

由tensorrt_llm::runtime::IGptDecoderBatched子类化

公共类型

using CudaStreamPtr = std::shared_ptr<CudaStream>

using TensorPtr = std::shared_ptr<ITensor>

公共函数

virtual void setup(executor::DecodingMode const &mode, SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, SizeType32 maxTokensPerStep, nvinfer1::DataType dtype, ModelConfig const &modelConfig) = 0: 在调用forward()之前设置解码器，同时调用reshapeBuffers。

virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig, ModelConfig const &modelConfig) = 0: 使用新一批输入初始化解码器。

virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) = 0: 为所有请求运行一步，而不阻塞主线程。

virtual void forwardSync() = 0: 等待最后一次调用 forwardAsync 完成。

inline virtual void forward(decoder::Output &output, decoder::Input const &input): 为所有请求运行一个步骤。

virtual void finalize(SamplingConfig const &samplingConfig) const = 0: 收集所有请求的最终波束搜索结果。

virtual TensorPtr getIds() const = 0

Returns:: [batchSize, beamWidth, maxSequenceLength], 所有令牌ID, 在GPU上

virtual TensorPtr getGatheredIds() const = 0

Returns:: [batchSize, beamWidth, maxSequenceLength] 在 gatherTree 后的 token ids

virtual TensorPtr getCumLogProbs() const = 0

Returns:: [batchSize, maxBeamWidth], 累积对数概率（每束），在GPU上

virtual TensorPtr getLogProbs() const = 0

Returns:: [batchSize, maxBeamWidth, maxSequenceLength], 对数概率（每束），在GPU上

virtual TensorPtr getNewTokens(SizeType32 iter = 0) const = 0

获取在上一次前向传播步骤中生成的令牌。

Parameters:: iter – 在 [0; maxTokensPerStep) 范围内获取令牌的迭代次数
Returns:: [batchSize, beamWidth], 在 iter 中生成的 tokens（每束），在 gpu 上

virtual TensorPtr getAllNewTokens() const = 0

获取在上一次前向传递中生成的maxTokensPerStep令牌。

Returns:: [maxTokensPerStep, batchSize, maxBeamWidth], 上次前向传递生成的tokens，在gpu上

virtual TensorPtr getNbFinished() const = 0

Returns:: [1], 已完成序列的数量，位于固定的主机内存中

virtual ~IStatefulGptDecoder() = default

受保护的函数

IStatefulGptDecoder() = default

namespace decoder

class Input

公共类型

using TensorPtr = ITensor::SharedPtr 

公共函数

inline explicit Input(TensorPtr logits)

公共成员

TensorPtr logits

TensorPtr cacheIndirection

class Output

公共类型

using TensorPtr = std::shared_ptr<ITensor>

公共函数

Output() = default

公共成员

TensorPtr cacheIndirection

TensorPtr sequenceLengths

iTensor.h

namespace nvinfer1

namespace tensorrt_llm

namespace runtime

函数

inline std::ostream &operator<<(std::ostream &output, ITensor::Shape const &dims): 用于打印形状的工具函数。

std::ostream &operator<<(std::ostream &output, ITensor const &tensor): 用于打印张量及其形状的实用函数。

template<typename T> T const *bufferCastOrNull(ITensor::SharedConstPtr const &tensorPtr)

检索一个指向由tensorPtr指向的张量的基础数据的T const类型指针，如果tensorPtr为空，则返回nullptr。

为了避免在涉及隐式转换为IBuffer时产生歧义，必须声明此重载。

Template Parameters:: T – 基础数据的类型。
Parameters:: tensorPtr – 一个可能为空的共享指针。
Returns:: 指向T常量的指针，可能为nullptr。

template<typename T> T *bufferCastOrNull(ITensor::SharedPtr const &tensorPtr)

检索指向由 tensorPtr 指向的缓冲区底层数据的 T 类型指针，如果 tensorPtr 为空，则返回 nullptr。

为了避免在涉及隐式转换为IBuffer时产生歧义，必须声明此重载。

Template Parameters:: T – 基础数据的类型。
Parameters:: tensorPtr – 一个可能为空的共享指针。
Returns:: 指向T的指针，可能为nullptr。

template<typename T> T *bufferCastOrNull(std::optional<ITensor::SharedPtr> const &optionalTensorPtr)

检索一个指向由optionalTensorPtr中包含的张量指针所指向的张量底层数据的T类型指针，如果optional没有值，则返回nullptr。

为了避免在涉及隐式转换为IBuffer时产生歧义，必须声明此重载。

Template Parameters:: T – 基础数据的类型。
Parameters:: optionalTensorPtr – 一个可能为空的optional。
Returns:: 指向T的指针，可能为nullptr。

template<typename T> T const *bufferCastOrNull(std::optional<ITensor::SharedConstPtr> const &optionalTensorPtr)

检索一个指向由optionalTensorPtr中包含的张量指针所指向的张量的基础数据的T const类型指针，如果optional没有值，则返回nullptr。

为了避免在涉及隐式转换为IBuffer时产生歧义，必须声明此重载。

Template Parameters:: T – 基础数据的类型。
Parameters:: optionalTensorPtr – 一个可能为空的optional。
Returns:: 指向常量T的指针，可能为nullptr。

class ITensor : public virtual tensorrt_llm::runtime::IBuffer 

公共类型

using UniquePtr = std::unique_ptr<ITensor>

using SharedPtr = std::shared_ptr<ITensor>

using UniqueConstPtr = std::unique_ptr<ITensor const>

using SharedConstPtr = std::shared_ptr<ITensor const>

using Shape = nvinfer1::Dims

using DimType64 = std::remove_reference_t<decltype(Shape::d[0])>

using TensorMap = runtime::StringPtrMap<runtime::ITensor>

公共函数

~ITensor() override = default

virtual Shape const &getShape() const = 0: 返回张量的维度。

template<SizeType32 n> inline DimType64 getDimension() const: 返回张量的第n维。如果n为负数，则返回第(nbDims - n)维。TODO: 在迁移到C++20时，用constexpr参数替换。

virtual void reshape(Shape const &dims) = 0: 设置张量的维度。张量的新大小将是 volume(dims)

inline virtual void resize(std::size_t newSize) override: 调整缓冲区的大小。如果新大小小于或等于当前容量，则不执行任何操作（no-op）。

ITensor(ITensor const&) = delete: 不允许复制。

ITensor &operator=(ITensor const&) = delete: 不允许复制。

inline void squeeze(SizeType32 dim): 从该张量中移除给定的单位维度。

inline void unsqueeze(SizeType32 dim): 在指定位置添加一个单位维度。

inline bool shapeEquals(Shape const &other) const

inline bool shapeEquals(std::initializer_list<SizeType32> const &other) const

template<typename T> inline bool shapeEquals(T const *dims, SizeType32 count) const

公共静态函数

static inline std::int64_t volume(Shape const &dims): 返回维度的体积。如果d.nbDims < 0，则返回-1。

static inline std::size_t volumeNonNegative(Shape const &shape): 返回维度的体积。如果 d.nbDims < 0 则抛出异常。

static inline Shape strides(Shape const &dims): 返回Shape中每个维度的步幅。

static Shape squeeze(Shape const &shape, SizeType32 dim)

从shape中移除给定的unit维度。

Parameters:

shape – 要压缩的形状。
dim – 应该被移除（“压缩”）的维度。

Returns:

一个新的形状，没有单位维度。

static Shape unsqueeze(Shape const &shape, SizeType32 dim)

在指定位置向shape添加一个单位维度。

Parameters:

shape – 要扩展维度（unsqueeze）的形状。
dim – 应该添加单位维度的维度。

Returns:

添加了单位维度的新形状。

static UniquePtr slice(SharedPtr tensor, std::size_t offset, std::size_t size)

在底层的tensor上创建一个切片视图。该视图将具有与tensor相同的数据类型。

Parameters:

tensor – 要查看的张量。
offset – 视图相对于张量维度0的偏移量。
size – 视图相对于张量维度0的大小。

Returns:

buffer 上的视图。

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0> static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)

static inline UniquePtr slice(SharedPtr tensor, std::size_t offset)

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0> static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)

static UniquePtr slice(SharedPtr tensor, Shape const &offsetDims, DimType64 size)

Parameters:

offsetDims – 多维度的偏移量。
tensor – 要查看的张量。
offsetDims – 视图的偏移尺寸。
size – 视图相对于offsetDims中最后一个维度的大小。
offsetDims – 指定所有维度。

Throws:

当偏移量溢出，或最后一个维度的偏移量 + 大小溢出时抛出异常。

Returns:

形状为 [size, 其余维度] 或 [size] 的视图

static inline UniquePtr slice(SharedPtr tensor, std::initializer_list<DimType64> const &offsetDims, DimType64 size)

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0> static inline UniqueConstPtr slice(TConstPtr &&tensor, Shape const &offsetDims, std::size_t size)

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0> static inline UniqueConstPtr slice(TConstPtr &&tensor, std::initializer_list<DimType64> const &offsetDims, std::size_t size)

static inline UniquePtr slice(SharedPtr tensor, 形状 const &offsetDims): 当size省略时，返回最后一个维度的剩余切片。

static inline UniquePtr slice(SharedPtr tensor, std::initializer_list<DimType64> const &offsetDims)

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0> static inline UniqueConstPtr slice(TConstPtr &&tensor, Shape const &offsetDims)

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0> static inline UniqueConstPtr slice(TConstPtr &&tensor, std::initializer_list<DimType64> const &offsetDims)

static inline UniquePtr at(SharedPtr tensor, Shape const &offsetDims)

Parameters:: offsetDims – 指定所有维度。
Returns:: 仅该点处的块，形状为 [其余维度] 或 [1]

static inline UniquePtr at(SharedPtr tensor, std::initializer_list<DimType64> const &offsetDims)

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0> static inline UniqueConstPtr at(TConstPtr &&tensor, Shape const &offsetDims)

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0> static inline ITensor::UniqueConstPtr at(TConstPtr &&tensor, std::initializer_list<DimType64> const &offsetDims)

static UniquePtr view(IBuffer::SharedPtr buffer, Shape const &dims)

返回具有给定形状的基础buffer（或张量）的视图。

Parameters:

tensor – 要查看的张量。
shape – 视图的形状。

Returns:

tensor 上的视图。

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0> static inline UniqueConstPtr view(TConstPtr &&tensor, Shape const &dims)

static inline UniquePtr view(SharedPtr tensor)

返回基础tensor上的视图，该视图可以独立地重塑。

Parameters:: tensor – 要查看的张量。
Returns:: tensor 上的视图。

static inline UniquePtr flattenN(SharedPtr tensor, std::int64_t sliceN = -1)

返回基础tensor的扁平化视图，该视图可以独立重塑。

Parameters:

tensor – 要展平的张量。
sliceN – 在展平后切片前N个元素。-1表示取整个展平的张量。

Returns:

对 tensor 的扁平化视图。

static UniquePtr wrap(void *data, nvinfer1::DataType type, Shape const &shape, std::size_t capacity)

将给定的data包装在ITensor中。ITensor不会拥有底层的data，并且不能超过capacity进行重塑。

Parameters:

data – 要包装的数据。
type – data 的数据类型。
shape – 张量的形状。
capacity – 缓冲区的容量。

Returns:

一个 ITensor。

static inline UniquePtr wrap(void *data, nvinfer1::DataType type, Shape const &shape)

template<typename T> static inline UniquePtr wrap(T *data, Shape const &shape, std::size_t capacity)

template<typename T> static inline UniquePtr wrap(T *data, Shape const &shape)

template<typename T> static inline UniquePtr wrap(std::vector<T> &v, Shape const &shape)

static Shape makeShape(std::initializer_list<DimType64> const &dims): 一个方便的函数，用于创建具有给定维度的张量形状。

static std::string toString(Shape const &dims): 一个方便的函数，用于将张量形状转换为string。

static inline bool shapeEquals(Shape const &lhs, Shape const &rhs): 一个用于比较形状的便捷函数。

template<typename T> static inline bool shapeEquals(Shape const &lhs, T const *dims, SizeType32 count): 一个用于比较形状的便捷函数。

受保护的函数

ITensor() = default

受保护的静态函数

static inline DimType64 castSize(size_t newSize)

友元

friend class ITensorBindings

ipcUtils.h

namespace tensorrt_llm

namespace runtime

函数

void lamportInitializeAll(void *buffer_0, void *buffer_1, void *buffer_2, size_t size)

class AllReduceBuffers

公共类型

using TensorPtr = ITensor::SharedPtr 

公共函数

AllReduceBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxSequenceLength, SizeType32 hiddenSize, BufferManager const &manager, WorldConfig const &worldConfig, bool const fakeBuffers = false)

公共成员

TensorPtr mAllReduceCommPtrs

std::vector<runtime::IpcMemory> mIpcMemoryHandles

class IpcMemory

公共类型

using BufferPtr = IBuffer::SharedPtr 

公共函数

IpcMemory(std::size_t bufferSize, BufferManager const &manager, WorldConfig const &worldConfig, bool openIpc = true)

~IpcMemory()

IpcMemory(IpcMemory const&) = delete

IpcMemory &operator=(IpcMemory const&) = delete

IpcMemory(IpcMemory&&) = default

IpcMemory &operator=(IpcMemory&&) = default

inline std::vector<void*> const &getCommPtrs() const

公共静态属性

static size_t constexpr FLAGS_SIZE = (tensorrt_llm::kernels::MAX_ALL_REDUCE_BLOCKS + 1) * sizeof(uint32_t)

私有函数

void allocateIpcMemory(std::size_t bufferSize, BufferManager const &manager, WorldConfig const &worldConfig)

void destroyIpcMemory()

私有成员

SizeType32 mTpRank

std::vector<void*> mCommPtrs

BufferPtr mBuffer

bool mOpenIpc

lookaheadBuffers.h

namespace tensorrt_llm

namespace runtime

class LookaheadDecodingBuffers

公共类型

using SizeType32 = runtime::SizeType32 

using TensorPtr = runtime::ITensor::SharedPtr 

using ITensor = tensorrt_llm::runtime::ITensor 

公共函数

LookaheadDecodingBuffers(SizeType32 maxNumSequences, SizeType32 maxTokensPerStep, runtime::BufferManager const &bufferManager)

公共成员

TensorPtr generationLengths

TensorPtr positionOffsets

TensorPtr packedMasks

TensorPtr positionIds

class LookaheadRuntimeBuffers

公共类型

using SizeType32 = tensorrt_llm::runtime::SizeType32 

using ITensor = tensorrt_llm::runtime::ITensor 

using TensorPtr = runtime::ITensor::SharedPtr 

using TensorMap = runtime::StringPtrMap<runtime::ITensor>

公共函数

LookaheadRuntimeBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, runtime::BufferManager const &manager, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig, executor::DecodingConfig const &decodingConfig, runtime::TllmRuntime const &runtime)

void setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ITensor const &requestTypes, ITensor const &seqSlots, LookaheadDecodingBuffers const &decoderLookaheadBuffers, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig) const

void reshape(SizeType32 numCtxSequences, SizeType32 numGenSequences, SizeType32 tokensPerStep)

void insertInputTensors(TensorMap &inputBuffers, TensorMap &outputBuffers, runtime::WorldConfig const &worldConfig) const

公共成员

TensorPtr cumSumLength

TensorPtr packedMasksDevice

TensorPtr generationLengthsDevice

TensorPtr positionOffsetsDevice

TensorPtr positionIdsDevice

TensorPtr packedMaskHost

TensorPtr generationLengthsHost

TensorPtr positionOffsetsHost

TensorPtr positionIdsHost

TensorPtr packedMaskHostCopy

TensorPtr generationLengthsHostCopy

TensorPtr positionOffsetsHostCopy

TensorPtr positionIdsHostCopy

TensorPtr batchSlotsHostCopy

lookaheadModule.h

namespace tensorrt_llm

namespace runtime

class LookaheadModule : public tensorrt_llm::runtime::SpeculativeDecodingModule 

公共函数

inline explicit LookaheadModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens) noexcept

inline explicit LookaheadModule() noexcept

inline void setExecutionConfig(executor::LookaheadDecodingConfig const &config)

inline executor::LookaheadDecodingConfig const getExecutionConfig() const

私有成员

executor::LookaheadDecodingConfig mExecutionConfig

loraCache.h

namespace tensorrt_llm

namespace runtime

函数

std::string to_string(LoraCache::TaskLayerModuleConfig const &v)

std::ostream &operator<<(std::ostream &os, LoraCache::TaskLayerModuleConfig const &v)

class LoraCache

#include <loraCache.h>

LoraCache

使用LRU淘汰策略缓存LoRA权重。

放入缓存的任务会被标记为进行中，并且在标记为完成之前不能被移除。

缓存页面保存了一个最佳大小的LoRA。一个页面的大小为[numSlots x pageWidth]。最佳大小的LoRA是具有配置的optimalAdapterSize的LoRA。

从概念上讲，一个槽对应于一组r=1、1层、1模块的输入/输出权重。页面宽度设置为最小模块中的权重数量。

每页的插槽数量为 ceilDiv(最优大小 LoRA 中的权重数量, 最小模块中的权重数量)

缓存页面分配在一个或多个块上

公共类型

using TensorPtr = ITensor::SharedPtr 

using TaskIdType = std::uint64_t

using TaskLayerModuleConfigListPtr = std::shared_ptr<std::vector<TaskLayerModuleConfig>>

公共函数

LoraCache(LoraCachePageManagerConfig const &pageManagerConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, BufferManager const &bufferManager): param[in] pageManagerConfig: 一个 LoraCachePageManagerConfig param[in] modelConfig: 一个 ModelConfig param[in] worldConfig: 一个 WorldConfig param[in] bufferManager: 一个 BufferManager 仅用于分配页面块

void put(TaskIdType taskId, TensorPtr weights, TensorPtr config, bool load = true)

将任务放入缓存中，并为其声明页面，并可选择加载任务权重。

Parameters:

taskId – [in] 任务ID
weights – [in] lora 权重张量
config – [in] lora 配置张量
load – [in] 如果为true，则在返回前加载权重，否则不加载

void loadWeights(TaskIdType taskId, TensorPtr weights, TensorPtr config)

加载任务权重。此方法必须在put之后调用。它设计为在put返回后异步调用，load = false

Parameters:

taskId – [in] 任务ID
weights – [in] lora 权重张量
config – [in] lora 配置张量

inline bool isLoaded(TaskIdType taskId) const

Parameters:: taskId – [in] 任务ID
Returns:: — 如果任务已加载（权重已就位）则为 true，否则为 false

bool isDone(TaskIdType taskId) const

Parameters:: taskId – [in] 任务ID
Returns:: — 如果任务标记为完成并且可以被驱逐，则为真

inline bool has(TaskIdType taskId) const

Parameters:: taskId – [in] 任务ID
Returns:: — 如果任务在缓存中（不一定已加载）则为 true，否则为 false

std::vector<TaskLayerModuleConfig> const &get(TaskIdType taskId)

Parameters:: taskId – [in] 任务ID
Returns:: — 包含指向任务权重的Value对象列表

void bump(TaskIdType taskId)

提升任务并使其成为最近使用的任务

Parameters:: taskId – [in] 任务ID

void markTaskDone(TaskIdType taskId)

标记任务完成意味着它可以被移除

Parameters:: taskId – [in] 任务ID

void markAllDone(): 将缓存中的所有任务标记为完成

SizeType32 determineNumPages(TaskIdType taskId) const

Parameters:: taskId – [in] 任务ID
Returns:: — 存储给定任务所需的页数

SizeType32 determineNumPages(TensorPtr config) const

Parameters:: config – [in] lora 配置张量
Returns:: — 存储配置为config tensor的任务所需的页数

bool fits(TensorPtr config) const

Parameters:: config – [in] 一个lora配置张量
Returns:: — 如果任务适合缓存则为真，否则为假

void copyTask(TaskIdType taskId, LoraCache &deviceCache, bool markDone = false)

将任务复制到另一个缓存。缓存必须具有相同的页面大小。

Parameters:

taskId – [in] 要复制的任务ID
deviceCache – [in] 将任务复制到的目标 LoraCache
markDone – [in] 在复制任务时将其标记为已完成

SizeType32 getNumPages() const

Returns:: — 分配给缓存的总页数（无论是否使用）

ITensor::SharedConstPtr getPagePtr(size_t pageId) const

Parameters:: pageId – [in] 页面ID
Returns:: — 指向页面的常量指针

公共静态函数

static std::vector<LoraCache::TaskLayerModuleConfig> copyToPages(TensorPtr weights, TensorPtr config, ModelConfig const &modelConfig, WorldConfig const &worldConfig, std::unordered_map<SizeType32, LoraModule> moduleIdToModel, BufferManager const &manager, std::vector<TensorPtr> const &pages, std::vector<std::size_t> const &pageIds)

将任务权重复制到缓存页面。

Parameters:

weights – [in] 任务权重
config – [in] 任务配置张量
modelConfig – [in] 一个 ModelConfig
worldConfig – [in] 一个 WorldConfig
moduleIdToModel – [in] 从lora模块ID到LoraModule的映射
manager – [in] 一个 BufferManager 用于执行复制的管理器
pages – [out] 要复制权重的页面张量列表
pageIds – [in] 页面的页面ID

Returns:

— 缓存值对象列表

static void splitTransposeCpu(ITensor &output, ITensor const &input, SizeType32 tpSize, SizeType32 tpRank)

将输入的第二维度分割为tpSize部分，并将tpRank分割写入输出

Parameters:

output – [out] 输出张量
input – [in] 输入张量
tpSize – [in] 分割数量
tpRank – [in] 要写入输出的分割

私有类型

enum ValueStatus

值：

enumerator kVALUE_STATUS_MISSING

enumerator kVALUE_STATUS_PROCESSING

enumerator kVALUE_STATUS_LOADED

using TaskValuePtr = std::shared_ptr<TaskValue>

私有函数

void loadWeights(TaskValue &cacheValue, TensorPtr weights, TensorPtr config)

void bumpTaskInProgress(TaskIdType taskId)

ValueStatus getStatus(TaskIdType taskId) const

std::vector<std::size_t> claimPagesWithEvict(SizeType32 numPages)

声明 numPages，如果需要则驱逐任务

Parameters:: numPages – [in] 要申请的页数
Throws:: std::runtime_error – 如果无法声明所有页面
Returns:: — 页面ID列表

std::map<size_t, std::pair<size_t, SizeType32>> copyTaskMapPages(TaskValue &targetTaskValue, TaskValue const &sourceTaskValue, std::vector<size_t> const &targetPageIds, LoraCache const &targetCache): 内部辅助方法，用于copyTask内部。本身不是线程安全的

私有成员

LoraCachePageManagerConfig mPageManagerConfig

ModelConfig mModelConfig

WorldConfig mWorldConfig

mutable std::mutex mPagesMutex

std::unique_ptr<LoraCachePageManager> mCachePageManager

mutable std::mutex mCacheMutex

std::unordered_map<TaskIdType, TaskValuePtr> mCacheMap

std::list<TaskIdType> mInProgressTasks

std::list<TaskIdType> mDoneTasks

std::vector<std::unique_ptr<BufferManager>> mDeviceBufferManagers

std::unique_ptr<BufferManager> mBufferManager

std::unordered_map<SizeType32, LoraModule> mModuleIdToModule

私有静态函数

template<typename T> static void splitTransposeCpuInner(ITensor &output, ITensor const &input, SizeType32 tpSize, SizeType32 tpRank)

struct TaskLayerModuleConfig

#include <loraCache.h>

包含有关单个层/模块的信息。这些配置的列表与每个任务相关联，并可用于填充运行时张量。

公共函数

std::string toString() const

bool operator==(LoraCache::TaskLayerModuleConfig const &o) const

公共成员

std::size_t pageId

SizeType32 slotIdx

SizeType32 inSize

SizeType32 outSize

SizeType32 moduleId

SizeType32 layerId

SizeType32 adapterSize

SizeType32 numSlots

std::int64_t weightsInPointer

std::int64_t weightsOutPointer

友元

friend class TaskLayerModuleConfigBindings

struct TaskValue

保存单个任务的配置和状态。

公共函数

TaskValue() = delete

~TaskValue() = default

inline TaskValue(std::vector<std::size_t> const &pageIds, TaskLayerModuleConfigListPtr const &configs, std::list<TaskIdType>::iterator it, bool inProgress, bool loaded, bool done, bool loadInProgress = false)

inline TaskValue(TaskValue &&o) noexcept

inline TaskValue &operator=(TaskValue &&o)

公共成员

std::vector<std::size_t> pageIds

TaskLayerModuleConfigListPtr configs

std::list<TaskIdType>::iterator it

bool inProgress

bool loaded

bool done: 标记任务为已完成。这用于在加载过程中将任务标记为已完成。如果在加载结束时（put、loadweights 或 copyTask 结束时）done=true，任务将被标记为已完成。

bool loadInProgress: 表示权重正在加载，无论是在输入中还是在loadWeights中。这用于阻止对同一任务的并发loadWeights调用。

class LoraCacheFullException : public tensorrt_llm::runtime::LoraExpectedException 

公共函数

explicit LoraCacheFullException(std::string const &msg)

~LoraCacheFullException() noexcept override

class LoraCachePageManager

#include <loraCache.h>

保存lora缓存页面的内存，并管理整个页面的分配和释放。内存是预先在主机或设备上分配的。

请注意，这个类不是线程安全的

公共类型

using TensorPtr = ITensor::SharedPtr 

公共函数

LoraCachePageManager(LoraCachePageManagerConfig const &config, BufferManager const &bufferManager)

Parameters:

config – [in] 一个 LoraCachePageManagerConfig
bufferManager – [in] 用于分配页面块的Buffermanager

std::optional<std::vector<std::size_t>> claimPages(SizeType32 numPages)

申请页面

Parameters:: numPages – [in] 要申请的页数
Returns:: 一个元组，其中第一个值是一个布尔值，表示页面是否被声明。如果第一个值为真，第二个值将包含一个页面ID列表。

SizeType32 numAvailablePages() const

获取管理器中可用（空闲）页面的数量

Returns:: 管理器中空闲页面的数量

void releasePages(std::vector<std::size_t> const &pages)

释放给定的页面

Parameters:: pages – [in] 要释放（释放）的页面列表

ITensor::SharedConstPtr blockPtr(SizeType32 blockIdx) const

返回指向给定页面块的指针

Parameters:: blockIdx – [in] 页面块的索引
Returns:: — 指向页面块的指针

ITensor::SharedConstPtr pagePtr(std::size_t pageIdx) const

返回指向给定页面的指针

Parameters:: pageIdx – [in] 页面的索引
Returns:: — 指向页面的常量指针

ITensor::SharedPtr mutablePagePtr(std::size_t pageIdx)

返回指向给定页面的指针

Parameters:: pageIdx – [in] 页面的索引
Returns:: — 可变的页面指针

私有函数

void initialize(BufferManager const &bufferManager)

私有成员

std::vector<TensorPtr> mPageBlocks

std::deque<std::size_t> mFreePageIds

std::vector<std::uint8_t> mIsPageFree

LoraCachePageManagerConfig const mConfig

class LoraExpectedException : public std::runtime_error

由 tensorrt_llm::runtime::LoraCacheFullException 子类化

公共函数

explicit LoraExpectedException(std::string const &msg)

~LoraExpectedException() noexcept override

loraCachePageManagerConfig.h

namespace tensorrt_llm

namespace runtime

函数

inline std::ostream &operator<<(std::ostream &os, LoraCachePageManagerConfig const &c)

inline std::string to_string(LoraCachePageManagerConfig const &c)

class LoraCachePageManagerConfig

#include <loraCachePageManagerConfig.h>

LoraCachePageManager的配置

请参阅LoraCache文档以了解页面、插槽和页面块的描述。

公共函数

inline explicit constexpr LoraCachePageManagerConfig(runtime::MemoryType memType, nvinfer1::DataType dType, SizeType32 totalNumPages, SizeType32 maxPagesPerBlock, SizeType32 slotsPerPage, SizeType32 pageWidth, SizeType32 numCopyStreams)

inline runtime::MemoryType constexpr getMemoryType() const noexcept

inline void constexpr setMemoryType(runtime::MemoryType const &memoryType) noexcept

inline nvinfer1::DataType constexpr getDataType() const noexcept

inline void constexpr setDataType(nvinfer1::DataType const &dtype) noexcept

inline SizeType32 constexpr getTotalNumPages() const noexcept

inline void constexpr setTotalNumPage(SizeType32 const &totalNumPages) noexcept

inline SizeType32 constexpr getMaxPagesPerBlock() const noexcept

inline void constexpr setMaxPagesPerBlock(SizeType32 const &maxPagesPerBlock) noexcept

inline SizeType32 constexpr getSlotsPerPage() const noexcept

inline void constexpr setSlotsPerPage(SizeType32 const &slotsPerPage) noexcept

inline SizeType32 constexpr getPageWidth() const noexcept

inline void constexpr setPageWidth(SizeType32 const &pageWidth) noexcept

inline bool constexpr getInitToZero() const noexcept

inline void constexpr setInitToZero(bool initToZero) noexcept

inline SizeType32 constexpr getNumCopyStreams() const noexcept

inline void constexpr setNumCopyStreams(SizeType32 numCopyStreams) noexcept

私有成员

runtime::MemoryType mMemoryType

nvinfer1::DataType mDataType

SizeType32 mTotalNumPages

SizeType32 mMaxPagesPerBlock

SizeType32 mSlotsPerPage

SizeType32 mPageWidth

SizeType32 mNumCopyStreams = 1

bool mInitToZero

loraModule.h

namespace tensorrt_llm

namespace runtime

函数

inline std::ostream &operator<<(std::ostream &output, LoraModule const &module)

class LoraModule

公共类型

enum class ModuleType : SizeType32 

值：

enumerator kINVALID

enumerator kATTN_QKV

enumerator kATTN_Q

enumerator kATTN_K

enumerator kATTN_V

enumerator kATTN_DENSE

enumerator kMLP_H_TO_4H

enumerator kMLP_4H_TO_H

enumerator kMLP_GATE

enumerator kCROSS_ATTN_QKV

enumerator kCROSS_ATTN_Q

enumerator kCROSS_ATTN_K

enumerator kCROSS_ATTN_V

enumerator kCROSS_ATTN_DENSE

enumerator kMOE_H_TO_4H

enumerator kMOE_4H_TO_H

enumerator kMOE_GATE

enumerator kMOE_ROUTER

enumerator kMLP_ROUTER

using TensorPtr = ITensor::SharedPtr 

公共函数

inline explicit constexpr LoraModule(ModuleType const &t, SizeType32 inDim, SizeType32 outDim, bool inDimFirst, bool outDimFirst, SizeType32 inTpSplitDim, SizeType32 outTpSplitDim) noexcept

inline explicit constexpr LoraModule() noexcept

explicit constexpr LoraModule(LoraModule const &o) = default

constexpr LoraModule &operator=(LoraModule const &o) = default

inline SizeType32 constexpr flattenedInOutSize(SizeType32 adapterSize) const noexcept

inline SizeType32 constexpr inSize(SizeType32 adapterSize) const noexcept

inline SizeType32 constexpr outSize(SizeType32 adapterSize) const noexcept

inline SizeType32 constexpr localInSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept

inline SizeType32 constexpr localOutSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept

inline SizeType32 constexpr localInDim(SizeType32 tpSize) const noexcept

inline SizeType32 constexpr localOutDim(SizeType32 tpSize) const noexcept

inline SizeType32 constexpr localInAdapterSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept

inline SizeType32 constexpr localOutAdapterSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept

inline SizeType32 constexpr localInOutSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept

inline SizeType32 constexpr value() const noexcept

inline std::string_view constexpr name() const noexcept

inline SizeType32 constexpr inDim() const noexcept

inline SizeType32 constexpr outDim() const noexcept

inline bool constexpr inDimFirst() const noexcept

inline bool constexpr outDimFirst() const noexcept

inline SizeType32 constexpr inTpSplitDim() const noexcept

inline SizeType32 constexpr outTpSplitDim() const noexcept

公共静态函数

static std::vector<LoraModule> createLoraModules(std::vector<std::string> const &loraModuleNames, SizeType32 hiddenSize, SizeType32 mlpHiddenSize, SizeType32 numAttentionHeads, SizeType32 numKvAttentionHeads, SizeType32 attentionHeadSize, SizeType32 tpSize, SizeType32 numExperts)

static inline ModuleType constexpr toModuleType(std::string_view const &name)

static inline std::string_view constexpr toModuleName(ModuleType t) noexcept

static inline std::string_view constexpr toModuleName(SizeType32 id)

私有成员

ModuleType mType

SizeType32 mInDim

SizeType32 mOutDim

bool mInDimFirst

bool mOutDimFirst

SizeType32 mInTpSplitDim

SizeType32 mOutTpSplitDim

medusaModule.h

namespace tensorrt_llm

namespace runtime

class MedusaModule : public tensorrt_llm::runtime::SpeculativeDecodingModule 

公共类型

using TensorPtr = ITensor::SharedPtr 

using MedusaChoices = std::vector<std::vector<SizeType32>>

公共函数

inline explicit MedusaModule(SizeType32 maxAcceptedTokens, SizeType32 maxDraftTokens) noexcept

inline explicit MedusaModule() noexcept

inline MedusaChoices const &getMedusaChoices() const noexcept

私有成员

MedusaChoices mDefaultMedusaChoices = {{0}, {0, 0}, {1}, {0, 1}, {2}, {0, 0, 0}, {1, 0}, {0, 2}, {3}, {0, 3}, {4}, {0, 4}, {2, 0}, {0, 5}, {0, 0, 1}, {5}, {0, 6}, {6}, {0, 7}, {0, 1, 0}, {1, 1}, {7}, {0, 8}, {0, 0, 2}, {3, 0}, {0, 9}, {8}, {9}, {1, 0, 0}, {0, 2, 0}, {1, 2}, {0, 0, 3}, {4, 0}, {2, 1}, {0, 0, 4}, {0, 0, 5}, {0, 0, 0, 0}, {0, 1, 1}, {0, 0, 6}, {0, 3, 0}, {5, 0}, {1, 3}, {0, 0, 7}, {0, 0, 8}, {0, 0, 9}, {6, 0}, {0, 4, 0}, {1, 4}, {7, 0}, {0, 1, 2}, {2, 0, 0}, {3, 1}, {2, 2}, {8, 0}, {0, 5, 0}, {1, 5}, {1, 0, 1}, {0, 2, 1}, {9, 0}, {0, 6, 0}, {0, 0, 0, 1}, {1, 6}, {0, 7, 0}}

memoryCounters.h

namespace tensorrt_llm

namespace runtime

class MemoryCounters

公共类型

using SizeType32 = std::size_t

using DiffType = std::ptrdiff_t

公共函数

MemoryCounters() = default

inline SizeType32 getGpu() const

inline SizeType32 getCpu() const

inline SizeType32 getPinned() const

inline SizeType32 getUVM() const

inline SizeType32 getPinnedPool() const

inline DiffType getGpuDiff() const

inline DiffType getCpuDiff() const

inline DiffType getPinnedDiff() const

inline DiffType getUVMDiff() const

inline DiffType getPinnedPoolDiff() const

template<MemoryType T> inline void allocate(SizeType32 size)

void allocate(MemoryType memoryType, SizeType32 size)

template<MemoryType T> inline void deallocate(SizeType32 size)

void deallocate(MemoryType memoryType, SizeType32 size)

std::string toString() const

公共静态函数

static MemoryCounters &getInstance()

static std::string bytesToString(SizeType32 bytes, int precision = 2)

static std::string bytesToString(DiffType bytes, int precision = 2)

私有成员

std::atomic<SizeType32> mGpu = {}

std::atomic<SizeType32> mCpu = {}

std::atomic<SizeType32> mPinned = {}

std::atomic<SizeType32> mUVM = {}

std::atomic<SizeType32> mPinnedPool = {}

std::atomic<DiffType> mGpuDiff = {}

std::atomic<DiffType> mCpuDiff = {}

std::atomic<DiffType> mPinnedDiff = {}

std::atomic<DiffType> mUVMDiff = {}

std::atomic<DiffType> mPinnedPoolDiff = {}

modelConfig.h

namespace tensorrt_llm

namespace runtime

class ModelConfig

公共类型

enum class ModelVariant : std::int32_t

值：

enumerator kGpt

enumerator kChatGlm

enumerator kGlm

enumerator kMamba

enumerator kRecurrentGemma

enumerator kEncDec

enum class LayerType : std::int32_t

值：

enumerator kATTENTION

enumerator kRECURRENT

enumerator kLINEAR

enumerator kNOOP

enum class KVCacheType : std::int32_t

值：

enumerator kCONTINUOUS

enumerator kPAGED

enumerator kDISABLED

enum class ManageWeightsType : std::int32_t

值：

enumerator kDisabled

enumerator kEnabled

公共函数

inline explicit ModelConfig(SizeType32 vocabSize, SizeType32 nbLayers, SizeType32 nbAttentionLayers, SizeType32 nbRnnLayers, SizeType32 nbHeads, SizeType32 hiddenSize, nvinfer1::DataType dtype)

inline SizeType32 constexpr getVocabSize() const noexcept

inline SizeType32 constexpr getVocabSizePadded(SizeType32 worldSize) const noexcept

inline SizeType32 countLocalLayers(LayerType layerType, SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const

inline SizeType32 countLowerRankLayers(LayerType layerType, SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const

inline SizeType32 getNbLayers(SizeType32 pipelineParallelism = 1) const

inline SizeType32 getNbAttentionLayers(SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const

inline SizeType32 getNbRnnLayers(SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const

inline SizeType32 constexpr getNbHeads() const noexcept

inline SizeType32 getNbKvHeads(SizeType32 layerIdx) const

inline void setNbKvHeads(SizeType32 nbKvHeads)

inline void setNbCrossKvHeads(SizeType32 nbKvHeads)

inline SizeType32 constexpr getHiddenSize() const noexcept

inline SizeType32 constexpr getEncoderHiddenSize() const noexcept

inline void constexpr setEncoderHiddenSize(SizeType32 encoderHiddenSize) noexcept

inline SizeType32 constexpr getSizePerHead() const noexcept

inline void constexpr setSizePerHead(SizeType32 sizePerHead) noexcept

inline nvinfer1::DataType constexpr getDataType() const noexcept

inline bool constexpr useGptAttentionPlugin() const noexcept

inline void constexpr useGptAttentionPlugin(bool useGptAttentionPlugin) noexcept

inline bool constexpr useMambaConv1dPlugin() const noexcept

inline void constexpr useMambaConv1dPlugin(bool useMambaConv1dPlugin) noexcept

inline bool constexpr usePackedInput() const noexcept

inline void constexpr usePackedInput(bool inputPacked) noexcept

inline bool constexpr usePagedState() const noexcept

inline void constexpr usePagedState(bool pagedState) noexcept

inline SizeType32 constexpr getTokensPerBlock() const noexcept

inline void constexpr setTokensPerBlock(SizeType32 TokensPerBlock) noexcept

inline common::QuantMode constexpr getQuantMode() const noexcept

inline void constexpr setQuantMode(common::QuantMode QuantMode) noexcept

inline bool constexpr supportsInflightBatching() const noexcept

inline SizeType32 constexpr getMaxBatchSize() const noexcept

inline void constexpr setMaxBatchSize(SizeType32 maxBatchSize) noexcept

inline SizeType32 constexpr getMaxBeamWidth() const noexcept

inline void constexpr setMaxBeamWidth(SizeType32 maxBeamWidth) noexcept

inline SizeType32 constexpr getMaxInputLen() const noexcept

inline void constexpr setMaxInputLen(SizeType32 maxInputLen) noexcept

inline SizeType32 constexpr getMaxSequenceLen() const noexcept

inline void constexpr setMaxSequenceLen(SizeType32 maxSequenceLen) noexcept

inline std::optional<SizeType32> constexpr getMaxNumTokens() const noexcept

inline void constexpr setMaxNumTokens(std::optional<SizeType32> maxNumTokens) noexcept

inline SizeType32 constexpr getMaxEncoderLen() const noexcept

inline void constexpr setMaxEncoderLen(SizeType32 maxEncoderLen) noexcept

inline bool constexpr usePromptTuning() const noexcept

inline bool constexpr useMrope() const noexcept

inline void constexpr setUseMrope(bool useMrope) noexcept

inline SizeType32 constexpr getMaxPositionEmbeddings() const noexcept

inline void constexpr setMaxPositionEmbeddings(SizeType32 maxPositionEmbeddings) noexcept

inline SizeType32 constexpr getRotaryEmbeddingDim() const noexcept

inline void constexpr setRotaryEmbeddingDim(SizeType32 rotaryEmbeddingDim) noexcept

inline SizeType32 constexpr getMaxPromptEmbeddingTableSize() const noexcept

inline void constexpr setMaxPromptEmbeddingTableSize(SizeType32 maxPromptEmbeddingTableSize) noexcept

inline bool constexpr computeContextLogits() const noexcept

inline void constexpr computeContextLogits(bool computeContextLogits) noexcept

inline bool constexpr computeGenerationLogits() const noexcept

inline void constexpr computeGenerationLogits(bool computeGenerationLogits) noexcept

inline ModelVariant getModelVariant() const

inline void setModelVariant(ModelVariant modelVariant)

inline SizeType32 getMaxDecodingDraftTokens() const

inline SizeType32 constexpr getMaxDecodingTokens() const noexcept

inline void constexpr setContextFMHA(bool contextFMHA) noexcept

inline bool constexpr getContextFMHA() const noexcept

inline void constexpr setPagedContextFMHA(bool pagedContextFMHA) noexcept

inline bool constexpr getPagedContextFMHA() const noexcept

inline void constexpr setPpReduceScatter(bool ppReduceScatter) noexcept

inline bool constexpr getPpReduceScatter() const noexcept

inline bool constexpr useLoraPlugin() const noexcept

inline void constexpr useLoraPlugin(bool useLoraPlugin) noexcept

inline std::vector<LoraModule> const &getLoraModules() const noexcept

inline void setLoraModules(std::vector<LoraModule> const &loraModules) noexcept

inline SizeType32 constexpr getMlpHiddenSize() const noexcept

inline void constexpr setMlpHiddenSize(SizeType32 mlpHiddenSize) noexcept

inline bool constexpr isKVCacheEnabled() const noexcept

inline bool constexpr isPagedKVCache() const noexcept

inline bool constexpr isContinuousKVCache() const noexcept

inline KVCacheType constexpr getKVCacheType() const noexcept

inline void constexpr setKVCacheType(KVCacheType kvCacheType) noexcept

inline bool constexpr useCrossAttention() const noexcept

inline void constexpr setUseCrossAttention(bool useCrossAttention) noexcept

inline bool constexpr usePositionEmbedding() const noexcept

inline void constexpr setUsePositionEmbedding(bool usePositionEmbedding) noexcept

inline bool constexpr useTokenTypeEmbedding() const noexcept

inline void constexpr setUseTokenTypeEmbedding(bool useTokenTypeEmbedding) noexcept

inline SizeType32 constexpr getMaxLoraRank() const noexcept

inline void constexpr setMaxLoraRank(SizeType32 maxLoraRank) noexcept

inline void setSpeculativeDecodingMode(SpeculativeDecodingMode mode) noexcept

inline bool hasSpeculativeDecodingModule() const noexcept

inline SpeculativeDecodingModule const &getSpeculativeDecodingModule() const noexcept

inline std::shared_ptr<SpeculativeDecodingModule const> getSpeculativeDecodingModulePtr() const noexcept

inline std::shared_ptr<SpeculativeDecodingModule> getSpeculativeDecodingModulePtr() noexcept

inline void setSpeculativeDecodingModule(std::shared_ptr<SpeculativeDecodingModule> const &speculativeDecodingModule) noexcept

inline nvinfer1::DataType getKvDataType() const noexcept

inline bool constexpr isTransformerBased() const noexcept

inline bool hasRnnConfig() const noexcept

inline std::optional<RnnConfig> getRnnConfig() const noexcept

inline void setRnnConfig(RnnConfig const &rnnConfig) noexcept

inline bool constexpr isRnnBased() const noexcept

inline std::vector<LayerType> const &getLayerTypes() const noexcept

inline void setLayerTypes(std::vector<LayerType> const &layerTypes) noexcept

inline SpeculativeDecodingMode constexpr getSpeculativeDecodingMode() const noexcept

inline void setLogitsDtype(nvinfer1::DataType inputDtype) noexcept

inline nvinfer1::DataType constexpr getLogitsDtype() const noexcept

inline void setUseShapeInference(bool useShapeInference) noexcept

inline bool useShapeInference() const noexcept

inline ManageWeightsType getManageWeightsType() const noexcept

inline void setManageWeightsType(const ManageWeightsType manageWeightType) noexcept

inline std::string const &getModelName() const noexcept

inline void setModelName(std::string const &modelName)

inline std::vector<SizeType32> const &getNumKvHeadsPerLayer() const

inline std::pair<std::vector<SizeType32>::const_iterator, std::vector<SizeType32>::const_iterator> getNumKvHeadsPerLayerLocalRange(SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0, bool isCrossAttention = false) const

inline void setNumKvHeadsPerLayer(std::vector<SizeType32> const &headsPerLayer)

inline void setNumKvHeadsPerCrossLayer(std::vector<SizeType32> const &headsPerLayer)

inline SizeType32 getSumLocalKvHeads(SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0, bool isCrossAttention = false) const

inline bool constexpr skipCrossAttnBlocks() const noexcept

inline void constexpr setSkipCrossAttnBlocks(bool skipCrossAttnBlocks) noexcept

公共静态函数

static inline KVCacheType KVCacheTypeFromString(std::string value)

static inline std::vector<SizeType32> getOptProfilesSplitPoints() noexcept

公共静态属性

static constexpr std::array kOPT_PROFILES_SPLIT_POINTS = {64, 128, 256, 512, 1024}

static constexpr SizeType32 kDEFAULT_NUM_TOKENS_PER_BLOCK = 64

私有成员

SizeType32 mVocabSize

SizeType32 mNbLayers

SizeType32 mNbAttentionLayers

SizeType32 mNbRnnLayers

SizeType32 mNbHeads

SizeType32 mHiddenSize

SizeType32 mSizePerHead

nvinfer1::DataType mDataType

bool mUseGptAttentionPlugin

bool mUseMambaConv1dPlugin

bool mInputPacked

bool mPagedState

SizeType32 mTokensPerBlock

common::QuantMode mQuantMode

SizeType32 mMaxBatchSize

SizeType32 mMaxBeamWidth

SizeType32 mMaxInputLen

SizeType32 mMaxSequenceLen

std::optional<SizeType32> mMaxNumTokens

bool mComputeContextLogits

bool mComputeGenerationLogits

ModelVariant mModelVariant

SizeType32 mMaxPromptEmbeddingTableSize

bool mUseMrope

SizeType32 mMaxPositionEmbeddings

SizeType32 mRotaryEmbeddingDim

bool mContextFMHA

bool mPagedContextFMHA

bool mUseXQA

bool mPpReduceScatter

bool mUseLoraPlugin

std::vector<LoraModule> mLoraModules

SizeType32 mMlpHiddenSize

SizeType32 mMaxLoraRank

std::optional<RnnConfig> mRnnConfig

KVCacheType mKVCacheType = KVCacheType::kCONTINUOUS 

SizeType32 mMaxEncoderLen = {}

SizeType32 mEncoderHiddenSize = {}

bool mUseCrossAttention

bool mUsePositionEmbedding

bool mUseTokenTypeEmbedding

std::vector<LayerType> mLayerTypes

std::shared_ptr<SpeculativeDecodingModule> mSpeculativeDecodingModule

SpeculativeDecodingMode mSpeculativeDecodingMode

nvinfer1::DataType mLogitsDtype

bool mUseShapeInference

ManageWeightsType mManageWeightsType

std::string mModelName

std::vector<SizeType32> mNumKvHeadsPerAttentionLayer

std::vector<SizeType32> mNumKvHeadsPerCrossAttentionLayer

bool mSkipCrossAttnBlocks

struct RnnConfig

公共成员

SizeType32 stateSize = 0

SizeType32 convKernel = 0

SizeType32 rnnHiddenSize = 0

SizeType32 rnnHeadSize = 0

SizeType32 rnnConvDimSize = 0

promptTuningParams.h

namespace tensorrt_llm

namespace runtime

template<typename TTensor> class GenericPromptTuningParams

公共类型

using TensorPtr = TTensor 

using SizeType32 = tensorrt_llm::runtime::SizeType32 

公共函数

inline explicit GenericPromptTuningParams(TensorPtr embeddingTable = TensorPtr(), TensorPtr tasks = TensorPtr(), TensorPtr vocabSize = TensorPtr())

公共成员

TensorPtr embeddingTable

TensorPtr tasks

TensorPtr vocabSize

std::vector<bool> promptTuningEnabled

class PromptTuningParams : public tensorrt_llm::runtime::GenericPromptTuningParams<ITensor::SharedPtr>

公共类型

using TensorPtr = ITensor::SharedPtr 

using SizeType32 = GenericPromptTuningParams::SizeType32 

公共函数

inline explicit PromptTuningParams(TensorPtr embeddingTable = nullptr, TensorPtr tasks = nullptr, TensorPtr vocabSize = nullptr)

void fillTasksTensor(TensorPtr tasksHost, const SizeType32 batchSize, const SizeType32 numContextRequests, std::vector<SizeType32> const &reqBeamWidths, std::vector<SizeType32> const &reqPromptLengths, BufferManager const &manager, bool packedInput)

rawEngine.h

namespace tensorrt_llm

namespace runtime

class RawEngine

公共类型

enum Type

值：

enumerator FilePath

enumerator AddressWithSize

enumerator HostMemory

公共函数

inline explicit RawEngine(std::filesystem::path enginePath) noexcept

inline explicit RawEngine(void const *engineAddr, std::size_t engineSize) noexcept

inline explicit RawEngine(nvinfer1::IHostMemory const *engineBuffer) noexcept

inline Type getType() const

inline std::filesystem::path getPath() const

inline std::optional<std::filesystem::path> getPathOpt() const

inline void setPath(std::filesystem::path enginePath)

inline std::optional<std::map<std::string, tensorrt_llm::executor::Tensor>> const &getManagedWeightsMapOpt() const

inline void setManagedWeightsMap(std::map<std::string, tensorrt_llm::executor::Tensor> managedWeightsMap)

inline void const *getAddress() const

inline std::size_t getSize() const

inline nvinfer1::IHostMemory const *getHostMemory() const

公共成员

void const *mEngineAddr = {}

std::size_t mEngineSize = {}

私有成员

Type mType

std::optional<std::filesystem::path> mEnginePath

struct tensorrt_llm::runtime::RawEngine

nvinfer1::IHostMemory const *mEngineBuffer = {}

std::optional<std::map<std::string, tensorrt_llm::executor::Tensor>> mManagedWeightsMap

request.h

namespace tensorrt_llm

namespace runtime

namespace decoder_batch

class Request

公共类型

using TensorConstPtr = ITensor::SharedConstPtr 

using TensorPtr = ITensor::SharedPtr 

using BufferPtr = IBuffer::SharedPtr 

公共函数

inline explicit Request(TensorConstPtr ids, SizeType32 inputLen, std::optional<SizeType32> maxNewTokens = std::nullopt, std::optional<SizeType32> endId = std::nullopt)

公共成员

TensorConstPtr ids

SizeType32 inputLen

std::optional<SizeType32> maxNewTokens

std::optional<SizeType32> endId

BufferPtr draftTokens

std::optional<TensorPtr> draftLogits

TensorPtr embeddingBias

TensorPtr badWordsList

TensorPtr stopWordsList

SizeType32 generatedTokensPerEngineStep

TensorPtr medusaPaths

TensorPtr medusaTreeIds

std::optional<executor::LookaheadDecodingConfig> lookaheadRuntimeConfig

std::optional<executor::EagleConfig> eagleConfig

nvinfer1::DataType dtype

runtimeDefaults.h

namespace tensorrt_llm

namespace runtime

struct RuntimeDefaults

公共函数

inline RuntimeDefaults(std::optional<std::vector<SizeType32>> maxAttentionWindowVec, std::optional<SizeType32> sinkTokenLength)

RuntimeDefaults() = default

公共成员

std::optional<std::vector<SizeType32>> maxAttentionWindowVec

std::optional<SizeType32> sinkTokenLength

samplingConfig.h

定义

SET_FROM_OPTIONAL(varName, VarName, VarType)

namespace tensorrt_llm

namespace runtime

class SamplingConfig

公共函数

inline explicit SamplingConfig(SizeType32 beamWidth = 1)

inline explicit SamplingConfig(std::vector<SamplingConfig> const &configs)

inline explicit SamplingConfig(executor::SamplingConfig const &samplingConfig, std::optional<executor::ExternalDraftTokensConfig> const &externalDraftTokensConfig)

inline bool validate()

inline bool operator==(SamplingConfig const &other) const

inline SizeType32 getNumReturnBeams() const

公共成员

SizeType32 beamWidth

std::optional<SizeType32> numReturnSequences

OptVec<FloatType> temperature

OptVec<FloatType> originalTemperature

OptVec<SizeType32> minLength

OptVec<FloatType> repetitionPenalty

OptVec<FloatType> presencePenalty

OptVec<FloatType> frequencyPenalty

OptVec<SizeType32> noRepeatNgramSize

OptVec<bool> outputLogProbs

OptVec<bool> cumLogProbs

OptVec<SizeType32> topK

OptVec<FloatType> topP

OptVec<uint64_t> randomSeed

OptVec<FloatType> topPDecay

OptVec<FloatType> topPMin

OptVec<TokenIdType> topPResetIds

OptVec<FloatType> beamSearchDiversityRate

OptVec<FloatType> lengthPenalty

OptVec<SizeType32> earlyStopping

OptVec<FloatType> draftAcceptanceThreshold

OptVec<std::vector<runtime::SizeType32>> topKMedusaHeads

std::optional<bool> normalizeLogProbs

私有类型

using FloatType = float

template<typename T> using OptVec = std::optional<std::vector<T>>

template<typename T> using Vec = std::vector<T>

私有函数

template<typename T> inline bool validateVec(std::string name, OptVec<T> const &vec, T min, std::optional<T> max = std::nullopt)

私有静态函数

template<typename T> static inline OptVec<T> fuseValues(std::vector<SamplingConfig> const &configs, std::function<OptVec<T>(size_t ci)> accessor, T defaultValue)

speculativeDecodingMode.h

namespace tensorrt_llm

namespace runtime

class SpeculativeDecodingMode

公共类型

using UnderlyingType = std::uint8_t

公共函数

inline bool constexpr isNone() const

inline bool constexpr isDraftTokensExternal() const

inline bool constexpr isMedusa() const

inline bool constexpr isLookaheadDecoding() const

inline bool constexpr isExplicitDraftTokens() const

inline bool constexpr isEagle() const

inline bool constexpr updatesPositionIds() const

inline bool constexpr requiresAttentionMask() const

inline bool constexpr predictsDraftTokens() const

inline bool constexpr needsKVCacheRewind() const

inline bool constexpr variableDraftLength() const

inline bool constexpr hasDraftLogits() const

inline bool constexpr needsDecoderPrologue() const

inline bool operator==(SpeculativeDecodingMode const &other) const

inline explicit constexpr SpeculativeDecodingMode(UnderlyingType state)

公共静态函数

static inline auto constexpr None()

static inline auto constexpr DraftTokensExternal()

static inline auto constexpr Medusa()

static inline auto constexpr LookaheadDecoding()

static inline auto constexpr ExplicitDraftTokens()

static inline auto constexpr Eagle()

私有函数

inline bool constexpr anyBitSet(UnderlyingType bits) const

inline bool constexpr allBitSet(UnderlyingType bits) const

私有成员

UnderlyingType mState = {kNone}

私有静态属性

static UnderlyingType constexpr kNone = {1U << 0U}

static UnderlyingType constexpr kDraftTokensExternal = {1U << 1U}

static UnderlyingType constexpr kMedusa = {1U << 2U}

static UnderlyingType constexpr kLookaheadDecoding = {1U << 3U}

static UnderlyingType constexpr kExplicitDraftTokens = {1U << 4U}

static UnderlyingType constexpr kEagle = {1U << 5U}

speculativeDecodingModule.h

namespace tensorrt_llm

namespace runtime

class SpeculativeDecodingModule

由tensorrt_llm::runtime::LookaheadModule, tensorrt_llm::runtime::MedusaModule子类化

公共函数

inline explicit SpeculativeDecodingModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens, SizeType32 maxNumPaths) noexcept

inline explicit SpeculativeDecodingModule() noexcept

virtual ~SpeculativeDecodingModule() = default

SpeculativeDecodingModule(SpeculativeDecodingModule const &o) = default

SpeculativeDecodingModule &operator=(SpeculativeDecodingModule const &o) = default

inline SizeType32 getMaxDraftPathLen() const noexcept

Returns:: 解码器每一步可以接受的最大草稿令牌数

inline SizeType32 getMaxPathLen() const noexcept

比草稿路径长度多一个（多出的一个用于主头的预测）

Returns:: 解码器一步中请求可以增长的最大令牌数

inline SizeType32 getMaxDecodingDraftTokens() const noexcept

Returns:: 解码器一步处理的最大草稿令牌数

inline SizeType32 getMaxDecodingTokens() const noexcept

比解码草稿令牌数多一个（多出的一个用于主头的预测）

Returns:: 解码器每一步处理的最大令牌数

inline SizeType32 getNumPackedMasks() const noexcept

inline SizeType32 getMaxNumPaths() const noexcept

inline void setMaxDraftTokens(SizeType32 maxDraftTokens) noexcept

inline void setMaxDraftPathLen(SizeType32 maxDraftPathLen) noexcept

inline void setMaxNumPaths(SizeType32 maxNumPaths) noexcept

私有函数

inline void computeNumPackedMasks() noexcept

私有成员

SizeType32 mMaxDraftPathLen

SizeType32 mMaxDecodingDraftTokens

SizeType32 mMaxNumPaths

SizeType32 mMaxNumPackedMasks

tllmLogger.h

namespace tensorrt_llm

namespace runtime

class TllmLogger : public nvinfer1::ILogger

公共函数

void log(Severity severity, nvinfer1::AsciiChar const *msg) noexcept override

Severity getLevel()

void setLevel(Severity level)

worldConfig.h

namespace tensorrt_llm

namespace runtime

class WorldConfig

公共函数

explicit WorldConfig(SizeType32 tensorParallelism = 1, SizeType32 pipelineParallelism = 1, SizeType32 contextParallelism = 1, SizeType32 rank = 0, SizeType32 gpusPerNode = kDefaultGpusPerNode, std::optional<std::vector<SizeType32>> const &deviceIds = std::nullopt)

inline SizeType32 constexpr getSize() const noexcept

inline SizeType32 constexpr getTensorParallelism() const noexcept

inline bool constexpr isTensorParallel() const noexcept

inline SizeType32 constexpr getPipelineParallelism() const noexcept

inline bool constexpr isPipelineParallel() const noexcept

inline SizeType32 constexpr getContextParallelism() const noexcept

inline bool constexpr isContextParallel() const noexcept

inline SizeType32 constexpr getRank() const noexcept

inline SizeType32 constexpr getGpusPerNode() const noexcept

inline SizeType32 getGpusPerGroup() const noexcept

inline SizeType32 getDevice() const noexcept

inline SizeType32 getDeviceOf(SizeType32 rank) const noexcept

inline SizeType32 constexpr getPipelineParallelRank() const noexcept

inline SizeType32 constexpr getTensorParallelRank() const noexcept

inline SizeType32 constexpr getContextParallelRank() const noexcept

inline SizeType32 constexpr getLocalRank() const noexcept

inline SizeType32 constexpr getNodeRank() const noexcept

inline SizeType32 constexpr getNodeRankOf(SizeType32 rank) const noexcept

inline bool constexpr isFirstPipelineParallelRank() const noexcept

inline bool constexpr isLastPipelineParallelRank() const noexcept: 当前 rank 是否是其流水线中的最后一个 rank？

inline bool constexpr isFirstTensorParallelRank() const noexcept

inline bool constexpr isFirstContextParallelRank() const noexcept

inline SizeType32 constexpr getLastRank() const noexcept

std::vector<SizeType32> getPipelineParallelGroup() const

std::vector<SizeType32> getTensorParallelGroup() const

std::vector<SizeType32> getContextParallelGroup() const

bool validMpiConfig() const

公共静态函数

static WorldConfig mpi(SizeType32 gpusPerNode = kDefaultGpusPerNode, std::optional<SizeType32> tensorParallelism = std::nullopt, std::optional<SizeType32> pipelineParallelism = std::nullopt, std::optional<SizeType32> contextParallelism = std::nullopt, std::optional<std::vector<SizeType32>> const &deviceIds = std::nullopt)

公共静态属性

static SizeType32 constexpr kDefaultGpusPerNode = 1

私有成员

SizeType32 mTensorParallelism

SizeType32 mPipelineParallelism

SizeType32 mContextParallelism

SizeType32 mRank

SizeType32 mGpusPerNode

std::vector<SizeType32> mDeviceIds