vllm.entrypoints.pooling.embed.protocol ¶

EmbeddingRequest `module-attribute` ¶

# Any request accepted for embedding: either the completion-style or the
# chat-style request model.
EmbeddingRequest: TypeAlias = EmbeddingCompletionRequest | EmbeddingChatRequest

EmbeddingBytesResponse ¶

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/pooling/embed/protocol.py

class EmbeddingBytesResponse(OpenAIBaseModel):
    # Response model whose payload is raw bytes rather than JSON values.
    # NOTE: documented with comments, not a docstring, so the generated
    # schema description is left unchanged.

    # One or more binary chunks making up the response body.
    content: list[bytes]
    # Optional header name -> value mapping; None when no headers are set.
    # Presumably forwarded as HTTP response headers -- confirm at call site.
    headers: dict[str, str] | None = None
    # Media type reported for the payload; defaults to generic binary data.
    media_type: str = "application/octet-stream"

content `instance-attribute` ¶

content: list[bytes]

headers `class-attribute` `instance-attribute` ¶

headers: dict[str, str] | None = None

media_type `class-attribute` `instance-attribute` ¶

media_type: str = 'application/octet-stream'

EmbeddingChatRequest ¶

Bases: PoolingBasicRequestMixin

Source code in vllm/entrypoints/pooling/embed/protocol.py

class EmbeddingChatRequest(PoolingBasicRequestMixin):
    # Chat-style embedding request: the input is a list of chat messages that
    # is rendered through a chat template before being embedded.
    # NOTE: documented with comments, not a docstring, so the generated
    # schema description is left unchanged.
    messages: list[ChatCompletionMessageParam]

    encoding_format: EncodingFormat = "float"
    dimensions: int | None = None

    # --8<-- [start:chat-embedding-extra-params]
    add_generation_prompt: bool = Field(
        default=False,
        description=(
            "If true, the generation prompt will be added to the chat template. "
            "This is a parameter used by chat template in tokenizer config of the "
            "model."
        ),
    )
    continue_final_message: bool = Field(
        default=False,
        description=(
            "If this is set, the chat will be formatted so that the final "
            "message in the chat is open-ended, without any EOS tokens. The "
            "model will continue this message rather than starting a new one. "
            'This allows you to "prefill" part of the model\'s response for it. '
            "Cannot be used at the same time as `add_generation_prompt`."
        ),
    )
    add_special_tokens: bool = Field(
        default=False,
        description=(
            "If true, special tokens (e.g. BOS) will be added to the prompt "
            "on top of what is added by the chat template. "
            "For most models, the chat template takes care of adding the "
            "special tokens so this should be set to false (as is the "
            "default)."
        ),
    )
    chat_template: str | None = Field(
        default=None,
        description=(
            "A Jinja template to use for this conversion. "
            "As of transformers v4.44, default chat template is no longer "
            "allowed, so you must provide a chat template if the tokenizer "
            "does not define one."
        ),
    )
    chat_template_kwargs: dict[str, Any] | None = Field(
        default=None,
        description=(
            "Additional keyword args to pass to the template renderer. "
            "Will be accessible by the chat template."
        ),
    )
    mm_processor_kwargs: dict[str, Any] | None = Field(
        default=None,
        description=("Additional kwargs to pass to the HF processor."),
    )
    normalize: bool | None = Field(
        default=None,
        description="Whether to normalize the embeddings outputs. Default is True.",
    )
    embed_dtype: EmbedDType = Field(
        default="float32",
        description=(
            "What dtype to use for encoding. Default to using float32 for base64 "
            "encoding to match the OpenAI python client behavior. "
            "This parameter will affect base64 and binary_response."
        ),
    )
    endianness: Endianness = Field(
        default="native",
        description=(
            "What endianness to use for encoding. Default to using native for "
            "base64 encoding to match the OpenAI python client behavior. "
            "This parameter will affect base64 and binary_response."
        ),
    )
    # --8<-- [end:chat-embedding-extra-params]

    @model_validator(mode="before")
    @classmethod
    def check_generation_prompt(cls, data: Any) -> Any:
        """Reject input that enables both prompt-continuation options.

        Runs before field validation, so ``data`` is the raw input and is not
        guaranteed to be a dict. Only dict input is inspected; any other
        input is returned untouched.

        Args:
            data: The raw, not yet validated input for this model.

        Returns:
            ``data`` unchanged.

        Raises:
            ValueError: If both ``continue_final_message`` and
                ``add_generation_prompt`` are truthy in ``data``.
        """
        if (
            isinstance(data, dict)
            and data.get("continue_final_message")
            and data.get("add_generation_prompt")
        ):
            raise ValueError(
                "Cannot set both `continue_final_message` and "
                "`add_generation_prompt` to True."
            )
        return data

    def to_pooling_params(self) -> PoolingParams:
        """Build the ``PoolingParams`` for this request.

        ``normalize`` is forwarded as ``use_activation``;
        ``truncate_prompt_tokens`` is not declared in this class and is
        presumably provided by ``PoolingBasicRequestMixin``.
        """
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            dimensions=self.dimensions,
            use_activation=self.normalize,
        )

add_generation_prompt `class-attribute` `instance-attribute` ¶

add_generation_prompt: bool = Field(
    default=False,
    description="If true, the generation prompt will be added to the chat template. This is a parameter used by chat template in tokenizer config of the model.",
)

add_special_tokens `class-attribute` `instance-attribute` ¶

add_special_tokens: bool = Field(
    default=False,
    description="If true, special tokens (e.g. BOS) will be added to the prompt on top of what is added by the chat template. For most models, the chat template takes care of adding the special tokens so this should be set to false (as is the default).",
)

chat_template `class-attribute` `instance-attribute` ¶

chat_template: str | None = Field(
    default=None,
    description="A Jinja template to use for this conversion. As of transformers v4.44, default chat template is no longer allowed, so you must provide a chat template if the tokenizer does not define one.",
)

chat_template_kwargs `class-attribute` `instance-attribute` ¶

chat_template_kwargs: dict[str, Any] | None = Field(
    default=None,
    description="Additional keyword args to pass to the template renderer. Will be accessible by the chat template.",
)

continue_final_message `class-attribute` `instance-attribute` ¶

continue_final_message: bool = Field(
    default=False,
    description='If this is set, the chat will be formatted so that the final message in the chat is open-ended, without any EOS tokens. The model will continue this message rather than starting a new one. This allows you to "prefill" part of the model\'s response for it. Cannot be used at the same time as `add_generation_prompt`.',
)

dimensions `class-attribute` `instance-attribute` ¶

dimensions: int | None = None

embed_dtype `class-attribute` `instance-attribute` ¶

embed_dtype: EmbedDType = Field(
    default="float32",
    description="What dtype to use for encoding. Default to using float32 for base64 encoding to match the OpenAI python client behavior. This parameter will affect base64 and binary_response.",
)

encoding_format `class-attribute` `instance-attribute` ¶

encoding_format: EncodingFormat = 'float'

endianness `class-attribute` `instance-attribute` ¶

endianness: Endianness = Field(
    default="native",
    description="What endianness to use for encoding. Default to using native for base64 encoding to match the OpenAI python client behavior. This parameter will affect base64 and binary_response.",
)

messages `instance-attribute` ¶

messages: list[ChatCompletionMessageParam]

mm_processor_kwargs `class-attribute` `instance-attribute` ¶

mm_processor_kwargs: dict[str, Any] | None = Field(
    default=None,
    description="Additional kwargs to pass to the HF processor.",
)

normalize `class-attribute` `instance-attribute` ¶

normalize: bool | None = Field(
    default=None,
    description="Whether to normalize the embeddings outputs. Default is True.",
)

check_generation_prompt `classmethod` ¶

check_generation_prompt(data)

Source code in vllm/entrypoints/pooling/embed/protocol.py

@model_validator(mode="before")
@classmethod
def check_generation_prompt(cls, data):
    """Reject input that enables both prompt-continuation options.

    Runs before field validation, so ``data`` is the raw input and is not
    guaranteed to be a dict. Only dict input is inspected; any other input
    is returned untouched.

    Raises:
        ValueError: If both ``continue_final_message`` and
            ``add_generation_prompt`` are truthy in ``data``.
    """
    if (
        isinstance(data, dict)
        and data.get("continue_final_message")
        and data.get("add_generation_prompt")
    ):
        raise ValueError(
            "Cannot set both `continue_final_message` and "
            "`add_generation_prompt` to True."
        )
    return data

to_pooling_params ¶

to_pooling_params()

Source code in vllm/entrypoints/pooling/embed/protocol.py

def to_pooling_params(self):
    """Translate this request's pooling options into a ``PoolingParams``."""
    pooling_kwargs = {
        "truncate_prompt_tokens": self.truncate_prompt_tokens,
        "dimensions": self.dimensions,
        # ``normalize`` is handed over under the ``use_activation`` name.
        "use_activation": self.normalize,
    }
    return PoolingParams(**pooling_kwargs)

EmbeddingCompletionRequest ¶

Bases: PoolingBasicRequestMixin, CompletionRequestMixin

Source code in vllm/entrypoints/pooling/embed/protocol.py

class EmbeddingCompletionRequest(PoolingBasicRequestMixin, CompletionRequestMixin):
    # Completion-style embedding request. The input field(s) are not declared
    # here; presumably they come from the mixins -- confirm there.
    # NOTE: documented with comments, not a docstring, so the generated
    # schema description is left unchanged.
    #
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/embeddings

    encoding_format: EncodingFormat = "float"
    dimensions: int | None = None

    # --8<-- [start:embedding-extra-params]
    normalize: bool | None = Field(
        default=None,
        description="Whether to normalize the embeddings outputs. Default is True.",
    )
    embed_dtype: EmbedDType = Field(
        default="float32",
        description=(
            "What dtype to use for encoding. Default to using float32 for base64 "
            "encoding to match the OpenAI python client behavior. "
            "This parameter will affect base64 and binary_response."
        ),
    )
    endianness: Endianness = Field(
        default="native",
        description=(
            "What endianness to use for encoding. Default to using native for "
            "base64 encoding to match the OpenAI python client behavior. "
            "This parameter will affect base64 and binary_response."
        ),
    )
    # --8<-- [end:embedding-extra-params]

    def to_pooling_params(self) -> PoolingParams:
        """Build the ``PoolingParams`` for this request.

        ``normalize`` is forwarded as ``use_activation``;
        ``truncate_prompt_tokens`` is not declared in this class and is
        presumably provided by one of the mixins.
        """
        return PoolingParams(
            dimensions=self.dimensions,
            use_activation=self.normalize,
            truncate_prompt_tokens=self.truncate_prompt_tokens,
        )

dimensions `class-attribute` `instance-attribute` ¶

dimensions: int | None = None

embed_dtype `class-attribute` `instance-attribute` ¶

embed_dtype: EmbedDType = Field(
    default="float32",
    description="What dtype to use for encoding. Default to using float32 for base64 encoding to match the OpenAI python client behavior. This parameter will affect base64 and binary_response.",
)

encoding_format `class-attribute` `instance-attribute` ¶

encoding_format: EncodingFormat = 'float'

endianness `class-attribute` `instance-attribute` ¶

endianness: Endianness = Field(
    default="native",
    description="What endianness to use for encoding. Default to using native for base64 encoding to match the OpenAI python client behavior. This parameter will affect base64 and binary_response.",
)

normalize `class-attribute` `instance-attribute` ¶

normalize: bool | None = Field(
    default=None,
    description="Whether to normalize the embeddings outputs. Default is True.",
)

to_pooling_params ¶

to_pooling_params()

Source code in vllm/entrypoints/pooling/embed/protocol.py

def to_pooling_params(self):
    """Translate this request's pooling options into a ``PoolingParams``."""
    pooling_kwargs = {
        "dimensions": self.dimensions,
        # ``normalize`` is handed over under the ``use_activation`` name.
        "use_activation": self.normalize,
        "truncate_prompt_tokens": self.truncate_prompt_tokens,
    }
    return PoolingParams(**pooling_kwargs)

EmbeddingResponse ¶

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/pooling/embed/protocol.py

class EmbeddingResponse(OpenAIBaseModel):
    # Top-level embedding response: a list of per-input results plus usage.
    # NOTE: documented with comments, not a docstring, so the generated
    # schema description is left unchanged.

    # Generated per instance: the "embd-" prefix followed by random_uuid().
    id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
    # Object-type tag; defaults to "list".
    object: str = "list"
    # Set at construction to time.time() truncated to an int.
    created: int = Field(default_factory=lambda: int(time.time()))
    # Model name; required, no default.
    model: str
    # One EmbeddingResponseData entry per returned embedding.
    data: list[EmbeddingResponseData]
    # Usage accounting for the request; required, no default.
    usage: UsageInfo

created `class-attribute` `instance-attribute` ¶

created: int = Field(default_factory=lambda: int(time.time()))

data `instance-attribute` ¶

data: list[EmbeddingResponseData]

id `class-attribute` `instance-attribute` ¶

id: str = Field(
    default_factory=lambda: f"embd-{random_uuid()}"
)

model `instance-attribute` ¶

model: str

object `class-attribute` `instance-attribute` ¶

object: str = 'list'

usage `instance-attribute` ¶

usage: UsageInfo

EmbeddingResponseData ¶

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/pooling/embed/protocol.py

class EmbeddingResponseData(OpenAIBaseModel):
    # A single embedding entry inside an embedding response.
    # NOTE: documented with comments, not a docstring, so the generated
    # schema description is left unchanged.

    # Position of this entry; presumably the index of the matching input --
    # confirm against the code that builds the response.
    index: int
    # Object-type tag; defaults to "embedding".
    object: str = "embedding"
    # Either a list of floats or a string; the string form is presumably the
    # base64-encoded vector -- confirm against the encoder.
    embedding: list[float] | str

embedding `instance-attribute` ¶

embedding: list[float] | str

index `instance-attribute` ¶

index: int

object `class-attribute` `instance-attribute` ¶

object: str = 'embedding'

vllm.entrypoints.pooling.embed.protocol ¶

EmbeddingRequest module-attribute ¶

EmbeddingBytesResponse ¶

content instance-attribute ¶

headers class-attribute instance-attribute ¶

media_type class-attribute instance-attribute ¶

EmbeddingChatRequest ¶

add_generation_prompt class-attribute instance-attribute ¶

add_special_tokens class-attribute instance-attribute ¶

chat_template class-attribute instance-attribute ¶

chat_template_kwargs class-attribute instance-attribute ¶

continue_final_message class-attribute instance-attribute ¶

dimensions class-attribute instance-attribute ¶

embed_dtype class-attribute instance-attribute ¶

encoding_format class-attribute instance-attribute ¶

endianness class-attribute instance-attribute ¶

messages instance-attribute ¶

mm_processor_kwargs class-attribute instance-attribute ¶

normalize class-attribute instance-attribute ¶

check_generation_prompt classmethod ¶

to_pooling_params ¶

EmbeddingCompletionRequest ¶

dimensions class-attribute instance-attribute ¶

embed_dtype class-attribute instance-attribute ¶

encoding_format class-attribute instance-attribute ¶

endianness class-attribute instance-attribute ¶

normalize class-attribute instance-attribute ¶

to_pooling_params ¶

EmbeddingResponse ¶

created class-attribute instance-attribute ¶

data instance-attribute ¶

id class-attribute instance-attribute ¶

model instance-attribute ¶

object class-attribute instance-attribute ¶

usage instance-attribute ¶

EmbeddingResponseData ¶

embedding instance-attribute ¶

index instance-attribute ¶

object class-attribute instance-attribute ¶

EmbeddingRequest `module-attribute` ¶

content `instance-attribute` ¶

headers `class-attribute` `instance-attribute` ¶

media_type `class-attribute` `instance-attribute` ¶

add_generation_prompt `class-attribute` `instance-attribute` ¶

add_special_tokens `class-attribute` `instance-attribute` ¶

chat_template `class-attribute` `instance-attribute` ¶

chat_template_kwargs `class-attribute` `instance-attribute` ¶

continue_final_message `class-attribute` `instance-attribute` ¶

dimensions `class-attribute` `instance-attribute` ¶

embed_dtype `class-attribute` `instance-attribute` ¶

encoding_format `class-attribute` `instance-attribute` ¶

endianness `class-attribute` `instance-attribute` ¶

messages `instance-attribute` ¶

mm_processor_kwargs `class-attribute` `instance-attribute` ¶

normalize `class-attribute` `instance-attribute` ¶

check_generation_prompt `classmethod` ¶

dimensions `class-attribute` `instance-attribute` ¶

embed_dtype `class-attribute` `instance-attribute` ¶

encoding_format `class-attribute` `instance-attribute` ¶

endianness `class-attribute` `instance-attribute` ¶

normalize `class-attribute` `instance-attribute` ¶

created `class-attribute` `instance-attribute` ¶

data `instance-attribute` ¶

id `class-attribute` `instance-attribute` ¶

model `instance-attribute` ¶

object `class-attribute` `instance-attribute` ¶

usage `instance-attribute` ¶

embedding `instance-attribute` ¶

index `instance-attribute` ¶

object `class-attribute` `instance-attribute` ¶