Postgresml

PostgresML索引 #

基类：`BaseManagedIndex`

PostgresML 索引。

PostgresML 索引实现了一个使用 PostgresML 作为后端的托管索引。PostgresML 在后端执行了传统索引中的许多功能：

- 将文档分解为块（节点）
- 为每个块（节点）创建嵌入向量
- 执行搜索以查找与查询最相似的前 k 个节点
- 可选择执行文本生成或聊天补全

源代码位于 `llama_index/indices/managed/postgresml/base.py`

class PostgresMLIndex(BaseManagedIndex):
    """
    PostgresML Index.

    The PostgresML index implements a managed index that uses PostgresML as the backend.
    PostgresML performs a lot of the functions in traditional indexes in the backend:
    - breaks down a document into chunks (nodes)
    - Creates the embedding for each chunk (node)
    - Performs the search for the top k most similar nodes to a query
    - Optionally can perform text-generation or chat completion
    """

    def __init__(
        self,
        collection_name: str,
        pipeline_name: Optional[str] = None,
        pipeline_schema: Optional[Dict[str, Any]] = None,
        pgml_database_url: Optional[str] = None,
        show_progress: bool = True,
        upsert_parallel_batches: int = 1,
        nodes: Optional[Sequence[BaseNode]] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize the PostgresML SDK.

        Args:
            collection_name: Name of the PostgresML collection; also used as
                the ``index_id`` of the index struct.
            pipeline_name: Pipeline name; ``"v1"`` is used when ``None``.
            pipeline_schema: Pipeline schema; when ``None`` a default schema
                is used that splits the ``content`` field with
                ``recursive_character`` (chunk size 1500) and embeds it with
                ``intfloat/e5-small-v2``.
            pgml_database_url: Passed as the second argument to ``Collection``.
            show_progress: Stored on the instance and forwarded to the base
                class constructor.
            upsert_parallel_batches: Sent as ``parallel_batches`` with every
                upsert issued by ``_insert``.
            nodes: Optional nodes to insert once the pipeline has been added.
            **kwargs: Forwarded to the base class constructor.
        """
        self.show_progress = show_progress
        self.upsert_parallel_batches = upsert_parallel_batches

        index_struct = PostgresMLIndexStruct(
            index_id=collection_name,
            summary="PostgresML Index",
        )

        super().__init__(
            show_progress=show_progress,
            index_struct=index_struct,
            **kwargs,
        )

        # Create our Collection and Pipeline
        self.collection = Collection(collection_name, pgml_database_url)
        if pipeline_name is None:
            pipeline_name = "v1"
        if pipeline_schema is None:
            pipeline_schema = {
                "content": {
                    "splitter": {
                        "model": "recursive_character",
                        "parameters": {"chunk_size": 1500},
                    },
                    "semantic_search": {
                        "model": "intfloat/e5-small-v2",
                        "parameters": {"prompt": "passage: "},
                    },
                }
            }
        self.pipeline = Pipeline(pipeline_name, pipeline_schema)

        # We must wrap self.collection.add_pipeline() with this async function
        # This is a limitation of the pyo3 async implementation
        async def add_pipeline() -> None:
            await self.collection.add_pipeline(self.pipeline)

        run_async_tasks([add_pipeline()])

        if nodes:
            self._insert(nodes)

    def _insert(
        self,
        nodes: Sequence[BaseNode],
        **insert_kwargs: Any,
    ) -> None:
        """Insert a set of documents (each a node).

        Each node becomes a dict with ``id``, ``content`` and ``metadata``
        keys. ``insert_kwargs`` are merged over ``parallel_batches`` and sent
        as the second argument of ``upsert_documents``.
        """
        documents = [
            {
                "id": node.node_id,
                "content": node.get_content(),
                "metadata": node.metadata,
            }
            for node in nodes
        ]

        # Caller-supplied keys win over the instance-level default.
        args = {"parallel_batches": self.upsert_parallel_batches, **insert_kwargs}

        # We must wrap self.collection.upsert_documents() with this async function
        # This is a limitation of the pyo3 async implementation
        async def upsert_documents() -> None:
            await self.collection.upsert_documents(documents, args)

        run_async_tasks([upsert_documents()])

    def add_documents(
        self,
        docs: Sequence[Document],
        **insert_kwargs: Any,
    ) -> None:
        """Convert each document to a ``TextNode`` and upsert them.

        ``insert_kwargs`` are forwarded to ``_insert``.
        """
        nodes = [TextNode(**doc.dict()) for doc in docs]
        self._insert(nodes, **insert_kwargs)

    def delete_ref_doc(self, ref_doc_id: str) -> None:
        """Delete the documents whose ``id`` equals ``ref_doc_id``."""

        # We must wrap self.collection.delete_documents() with this async function
        # This is a limitation of the pyo3 async implementation
        async def delete_documents() -> None:
            await self.collection.delete_documents({"id": {"$eq": ref_doc_id}})

        run_async_tasks([delete_documents()])

    def update_ref_doc(self, document: Document) -> None:
        """Upsert a single document, passing ``merge=True`` to the upsert."""
        node = TextNode(**document.dict())
        self._insert([node], merge=True)

    def as_retriever(self, **kwargs: Any) -> BaseRetriever:
        """Return a Retriever for this managed index."""
        from llama_index.indices.managed.postgresml.retriever import (
            PostgresMLRetriever,
        )

        return PostgresMLRetriever(self, **kwargs)

    def as_query_engine(self, **kwargs: Any) -> BaseQueryEngine:
        """Return a query engine backed by a retriever for this index.

        The same ``kwargs`` are passed to both the ``PostgresMLRetriever``
        and the ``PostgresMLQueryEngine``.
        """
        from llama_index.indices.managed.postgresml.retriever import (
            PostgresMLRetriever,
        )
        from llama_index.indices.managed.postgresml.query import (
            PostgresMLQueryEngine,
        )

        return PostgresMLQueryEngine(PostgresMLRetriever(self, **kwargs), **kwargs)

    @classmethod
    def from_documents(
        cls: Type[IndexType],
        documents: Sequence[Document],
        collection_name: Optional[str] = None,
        pipeline_name: Optional[str] = None,
        pipeline_schema: Optional[Dict[str, Any]] = None,
        pgml_database_url: Optional[str] = None,
        show_progress: bool = False,
        upsert_parallel_batches: int = 1,
        **kwargs: Any,
    ) -> IndexType:
        """Build a PostgresML index from a sequence of documents.

        Every document is converted to a ``TextNode`` built from
        ``doc.dict()``; the nodes and all remaining arguments are handed to
        the constructor.

        Raises:
            ValueError: If ``collection_name`` is ``None``.
        """
        if collection_name is None:
            # ValueError is an Exception subclass, so callers that catch
            # Exception keep working.
            raise ValueError("collection_name is a required argument")
        nodes = [TextNode(**doc.dict()) for doc in documents]
        return cls(
            collection_name,
            pipeline_name=pipeline_name,
            pipeline_schema=pipeline_schema,
            pgml_database_url=pgml_database_url,
            nodes=nodes,
            show_progress=show_progress,
            upsert_parallel_batches=upsert_parallel_batches,
            **kwargs,
        )

as_retriever #

as_retriever(**kwargs: Any) -> BaseRetriever

返回此托管索引的检索器。

源代码位于 `llama_index/indices/managed/postgresml/base.py`

def as_retriever(self, **kwargs: Any) -> BaseRetriever:
    """Build a ``PostgresMLRetriever`` bound to this index.

    All keyword arguments are forwarded to the retriever constructor.
    """
    # Imported at call time, inside the method body.
    from llama_index.indices.managed.postgresml.retriever import (
        PostgresMLRetriever as retriever_cls,
    )

    retriever = retriever_cls(self, **kwargs)
    return retriever

from_documents `classmethod` #

from_documents(documents: Sequence[Document], collection_name: Optional[str] = None, pipeline_name: Optional[str] = None, pipeline_schema: Optional[Dict[str, Any]] = None, pgml_database_url: Optional[str] = None, show_progress: bool = False, upsert_parallel_batches: int = 1, **kwargs: Any) -> IndexType

从一系列文档构建一个PostgresML索引。

源代码位于 `llama_index/indices/managed/postgresml/base.py`

@classmethod
def from_documents(
    cls: Type[IndexType],
    documents: Sequence[Document],
    collection_name: Optional[str] = None,
    pipeline_name: Optional[str] = None,
    pipeline_schema: Optional[Dict[str, Any]] = None,
    pgml_database_url: Optional[str] = None,
    show_progress: bool = False,
    upsert_parallel_batches: int = 1,
    **kwargs: Any,
) -> IndexType:
    """Build a PostgresML index from a sequence of documents.

    Every document is converted to a ``TextNode`` built from ``doc.dict()``;
    the resulting list is passed to the constructor as ``nodes``.

    Args:
        documents: Documents to convert and index.
        collection_name: Required despite the ``None`` default; passed to the
            constructor as its only positional argument.
        pipeline_name: Forwarded unchanged to the constructor.
        pipeline_schema: Forwarded unchanged to the constructor.
        pgml_database_url: Forwarded unchanged to the constructor.
        show_progress: Forwarded unchanged to the constructor.
        upsert_parallel_batches: Forwarded unchanged to the constructor.
        **kwargs: Forwarded unchanged to the constructor.

    Returns:
        The instance created by calling ``cls``.

    Raises:
        ValueError: If ``collection_name`` is ``None``.
    """
    if collection_name is None:
        # ValueError is an Exception subclass, so callers that catch
        # Exception keep working.
        raise ValueError("collection_name is a required argument")
    nodes = [TextNode(**doc.dict()) for doc in documents]
    return cls(
        collection_name,
        pipeline_name=pipeline_name,
        pipeline_schema=pipeline_schema,
        pgml_database_url=pgml_database_url,
        nodes=nodes,
        show_progress=show_progress,
        upsert_parallel_batches=upsert_parallel_batches,
        **kwargs,
    )

PostgresML检索器 #

基类：`BaseRetriever`

PostgresML 检索器。

参数：

名称	类型	描述	默认
`index`	`PostgresMLIndex`	PostgresML 索引	required

源代码位于 `llama_index/indices/managed/postgresml/retriever.py`

class PostgresMLRetriever(BaseRetriever):
    """
    PostgresML Retriever.

    Args:
        index (PostgresMLIndex): the PostgresML Index
        callback_manager (Optional[CallbackManager]): passed to the base
            retriever constructor
        pgml_query (Optional[Dict[str, Any]]): a complete query dict; when
            set (truthy) it is sent to ``vector_search`` as-is and the query
            bundle, ``limit`` and ``rerank`` are not used to build the query
        limit (Optional[int]): value of the ``"limit"`` key of the generated
            query
        rerank (Optional[Dict[str, Any]]): value of the ``"rerank"`` key of
            the generated query, with the query text added under ``"query"``;
            when not ``None`` results are scored by ``"rerank_score"``
            instead of ``"score"``

    """

    def __init__(
        self,
        index: PostgresMLIndex,
        callback_manager: Optional[CallbackManager] = None,
        pgml_query: Optional[Dict[str, Any]] = None,
        limit: Optional[int] = 5,
        rerank: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize params.

        Extra ``kwargs`` are accepted and ignored.
        """
        self._index = index
        self._pgml_query = pgml_query
        self._limit = limit
        self._rerank = rerank
        super().__init__(callback_manager)

    def _retrieve(
        self,
        query_bundle: Optional[QueryBundle] = None,
        **kwargs: Any,
    ) -> List[NodeWithScore]:
        """Run ``_aretrieve`` synchronously and return its result."""
        return run_async_tasks([self._aretrieve(query_bundle, **kwargs)])[0]

    async def _aretrieve(
        self,
        query_bundle: Optional[QueryBundle] = None,
        **kwargs: Any,
    ) -> List[NodeWithScore]:
        """Run a vector search and wrap each result in a ``NodeWithScore``.

        Args:
            query_bundle: Supplies the query text; required unless a
                ``pgml_query`` was given at construction time.
            **kwargs: Accepted so that ``_retrieve`` can forward its keyword
                arguments without raising ``TypeError``; currently unused.

        Raises:
            ValueError: If neither a ``pgml_query`` nor a ``query_bundle`` is
                available.
        """
        if self._pgml_query:
            search_query = self._pgml_query
        else:
            if not query_bundle:
                # ValueError is an Exception subclass, so callers that catch
                # Exception keep working.
                raise ValueError(
                    "Must provide either query or query_bundle to retrieve and aretrieve"
                )
            # Build a per-call copy instead of rewriting self._rerank, so the
            # configured rerank settings are never modified by a query.
            rerank = self._rerank
            if rerank is not None:
                rerank = {**rerank, "query": query_bundle.query_str}
            search_query = {
                "query": {
                    "fields": {
                        "content": {
                            "query": query_bundle.query_str,
                            "parameters": {"prompt": "query: "},
                        }
                    }
                },
                "rerank": rerank,
                "limit": self._limit,
            }

        results = await self._index.collection.vector_search(
            search_query,
            self._index.pipeline,
        )

        # With rerank configured the score is read from "rerank_score".
        score_key = "score" if self._rerank is None else "rerank_score"
        return [
            NodeWithScore(
                node=TextNode(
                    id_=r["document"]["id"],
                    text=r["chunk"],
                    metadata=r["document"]["metadata"],
                ),
                score=r[score_key],
            )
            for r in results
        ]

选项：成员：- PostgresMLRetriever - PostgresMLIndex

Postgresml

PostgresML索引 #

as_retriever #

from_documents classmethod #

PostgresML检索器 #

from_documents `classmethod` #