跳转到内容

`ObjectIndex` 类

ObjectIndex 类允许对任意 Python 对象进行索引。因此,它非常灵活,适用于广泛的用例。例如,可以用它索引工具(Tool)对象供智能体检索使用,或索引 SQL 表结构(SQLTableSchema)对象。

要构建一个 ObjectIndex,我们需要一个索引以及另一个抽象概念,即 ObjectNodeMapping。顾名思义,这种映射提供了在节点与关联对象之间相互转换的方法。另外,还存在一个 from_objects() 类方法,可以方便地从一组对象构建 ObjectIndex。

在本笔记本中,我们将快速介绍如何使用 SimpleObjectNodeMapping 构建一个 ObjectIndex。

from llama_index.core import Settings
Settings.embed_model = "local"
from llama_index.core import VectorStoreIndex
from llama_index.core.objects import ObjectIndex, SimpleObjectNodeMapping
# some really arbitrary objects
obj1 = {"input": "Hey, how's it going"}
obj2 = ["a", "b", "c", "d"]
obj3 = "llamaindex is an awesome library!"
arbitrary_objects = [obj1, obj2, obj3]
# (optional) object-node mapping
# Build the mapping from the objects, then use it to turn them into nodes.
obj_node_mapping = SimpleObjectNodeMapping.from_objects(arbitrary_objects)
nodes = obj_node_mapping.to_nodes(arbitrary_objects)
# object index
# Explicit construction: a vector index over the nodes plus the mapping.
object_index = ObjectIndex(
index=VectorStoreIndex(nodes=nodes),
object_node_mapping=obj_node_mapping,
)
# object index from_objects (default index_cls=VectorStoreIndex)
# This rebinds object_index, replacing the instance constructed explicitly
# just above; later cells use the one created here.
object_index = ObjectIndex.from_objects(
arbitrary_objects, index_cls=VectorStoreIndex
)

有了object_index,我们可以将其用作检索器,对索引对象进行检索。

object_retriever = object_index.as_retriever(similarity_top_k=1)
object_retriever.retrieve("llamaindex")
['llamaindex is an awesome library!']

我们还可以向对象索引检索器添加节点后处理器,以便轻松实现诸如重排器等更多功能。

%pip install llama-index-postprocessor-colbert-rerank
from llama_index.postprocessor.colbert_rerank import ColbertRerank
retriever = object_index.as_retriever(
similarity_top_k=2, node_postprocessors=[ColbertRerank(top_n=1)]
)
retriever.retrieve("a random list object")
['llamaindex is an awesome library!']

对象索引支持与LlamaIndex中任何现有存储后端的集成。

以下部分将逐步介绍如何使用 Chroma 作为示例进行设置。

%pip install llama-index-vector-stores-chroma
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("quickstart2")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
object_index = ObjectIndex.from_objects(
arbitrary_objects,
index_cls=VectorStoreIndex,
storage_context=storage_context,
)
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[31], line 5
2 from llama_index.vector_stores.chroma import ChromaVectorStore
3 import chromadb
----> 5 db = chromadb.PersistentClient(path="./chroma_db2")
6 chroma_collection = db.get_or_create_collection("quickstart2")
7 vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
File ~/giant_change/llama_index/venv/lib/python3.10/site-packages/chromadb/__init__.py:146, in PersistentClient(path, settings, tenant, database)
143 tenant = str(tenant)
144 database = str(database)
--> 146 return ClientCreator(tenant=tenant, database=database, settings=settings)
File ~/giant_change/llama_index/venv/lib/python3.10/site-packages/chromadb/api/client.py:139, in Client.__init__(self, tenant, database, settings)
133 def __init__(
134 self,
135 tenant: str = DEFAULT_TENANT,
136 database: str = DEFAULT_DATABASE,
137 settings: Settings = Settings(),
138 ) -> None:
--> 139 super().__init__(settings=settings)
140 self.tenant = tenant
141 self.database = database
File ~/giant_change/llama_index/venv/lib/python3.10/site-packages/chromadb/api/client.py:43, in SharedSystemClient.__init__(self, settings)
38 def __init__(
39 self,
40 settings: Settings = Settings(),
41 ) -> None:
42 self._identifier = SharedSystemClient._get_identifier_from_settings(settings)
---> 43 SharedSystemClient._create_system_if_not_exists(self._identifier, settings)
File ~/giant_change/llama_index/venv/lib/python3.10/site-packages/chromadb/api/client.py:54, in SharedSystemClient._create_system_if_not_exists(cls, identifier, settings)
51 cls._identifer_to_system[identifier] = new_system
53 new_system.instance(ProductTelemetryClient)
---> 54 new_system.instance(ServerAPI)
56 new_system.start()
57 else:
File ~/giant_change/llama_index/venv/lib/python3.10/site-packages/chromadb/config.py:382, in System.instance(self, type)
379 type = get_class(fqn, type)
381 if type not in self._instances:
--> 382 impl = type(self)
383 self._instances[type] = impl
384 if self._running:
File ~/giant_change/llama_index/venv/lib/python3.10/site-packages/chromadb/api/segment.py:102, in SegmentAPI.__init__(self, system)
100 super().__init__(system)
101 self._settings = system.settings
--> 102 self._sysdb = self.require(SysDB)
103 self._manager = self.require(SegmentManager)
104 self._quota = self.require(QuotaEnforcer)
File ~/giant_change/llama_index/venv/lib/python3.10/site-packages/chromadb/config.py:281, in Component.require(self, type)
278 def require(self, type: Type[T]) -> T:
279 """Get a Component instance of the given type, and register as a dependency of
280 that instance."""
--> 281 inst = self._system.instance(type)
282 self._dependencies.add(inst)
283 return inst
File ~/giant_change/llama_index/venv/lib/python3.10/site-packages/chromadb/config.py:382, in System.instance(self, type)
379 type = get_class(fqn, type)
381 if type not in self._instances:
--> 382 impl = type(self)
383 self._instances[type] = impl
384 if self._running:
File ~/giant_change/llama_index/venv/lib/python3.10/site-packages/chromadb/db/impl/sqlite.py:88, in SqliteDB.__init__(self, system)
84 self._db_file = (
85 self._settings.require("persist_directory") + "/chroma.sqlite3"
86 )
87 if not os.path.exists(self._db_file):
---> 88 os.makedirs(os.path.dirname(self._db_file), exist_ok=True)
89 self._conn_pool = PerThreadPool(self._db_file)
90 self._tx_stack = local()
File ~/miniforge3/lib/python3.10/os.py:225, in makedirs(name, mode, exist_ok)
223 return
224 try:
--> 225 mkdir(name, mode)
226 except OSError:
227 # Cannot rely on checking for EEXIST, since the operating system
228 # could give priority to other errors like EACCES or EROFS
229 if not exist_ok or not path.isdir(name):
FileNotFoundError: [Errno 2] No such file or directory: './chroma_db2'
object_retriever = object_index.as_retriever(similarity_top_k=1)
object_retriever.retrieve("llamaindex")
['llamaindex is an awesome library!']

现在,让我们“重新加载”索引

# Re-open the same on-disk Chroma database that was populated earlier.
db = chromadb.PersistentClient(path="./chroma_db")
# The collection name must match the one the index was written to
# ("quickstart2"). get_or_create_collection silently creates a new collection
# for an unknown name, so a mismatch would "reload" an unrelated collection.
chroma_collection = db.get_or_create_collection("quickstart2")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
# The objects themselves are not stored in the vector store, so they have to
# be supplied again when rebuilding the ObjectIndex.
object_index = ObjectIndex.from_objects_and_index(arbitrary_objects, index)
object_retriever = object_index.as_retriever(similarity_top_k=1)
object_retriever.retrieve("llamaindex")
['llamaindex is an awesome library!']

请注意,当我们重新加载索引时,仍然需要传递对象,因为这些对象并未保存在实际的索引/向量数据库中。

对于需要完全控制对象如何映射到节点的特殊场景,您也可以提供 to_node_fn() 和 from_node_fn() 钩子函数。

这在您转换特殊对象时非常有用,或者当您希望在运行时动态创建对象而不是将其保留在内存中时。

下面展示一个小示例。

from llama_index.core.schema import TextNode
def _object_key(obj):
    """Return the string key used both as lookup key and as node id for *obj*.

    NOTE: ``hash()`` of a ``str`` is salted per interpreter process unless
    PYTHONHASHSEED is fixed, so these keys are only valid within the current
    process.
    """
    return str(hash(str(obj)))


# Lookup table from node id back to the original object.
my_objects = {_object_key(obj): obj for obj in arbitrary_objects}


def from_node_fn(node):
    """Return the original object for *node*, looked up by the node's id."""
    return my_objects[node.node_id]


def to_node_fn(obj):
    """Convert *obj* into a TextNode whose id is derived from ``str(obj)``."""
    # ``id_`` is the node's id field; the key must match the one used in
    # ``my_objects`` so that from_node_fn can find the object again.
    return TextNode(id_=_object_key(obj), text=str(obj))
object_index = ObjectIndex.from_objects(
arbitrary_objects,
index_cls=VectorStoreIndex,
from_node_fn=from_node_fn,
to_node_fn=to_node_fn,
)
object_retriever = object_index.as_retriever(similarity_top_k=1)
object_retriever.retrieve("llamaindex")
['llamaindex is an awesome library!']

当涉及到持久化 ObjectIndex 时,我们需要同时处理索引以及对象节点映射。持久化索引相对简单,可以通过常规方式处理(例如,参见本指南)。然而,在持久化 ObjectNodeMapping 时情况则有所不同。由于我们使用 ObjectIndex 对任意 Python 对象进行索引,可能会出现(或许比我们期望的更频繁)这些任意对象不可序列化的情况。在这些情况下,您可以持久化索引,但用户需要维护一种重建 ObjectNodeMapping 的方法,以便能够重建 ObjectIndex。为方便起见,ObjectIndex 上提供了 persist 和 from_persist_dir 方法,它们将分别尝试持久化和加载先前保存的 ObjectIndex。

# persist to disk (no path provided will persist to the default path ./storage)
object_index.persist()
# re-loading (no path provided will attempt to load from the default path ./storage)
reloaded_object_index = ObjectIndex.from_persist_dir()
reloaded_object_index._object_node_mapping.obj_node_mapping
{7981070310142320670: {'input': "Hey, how's it going"},
-5984737625581842527: ['a', 'b', 'c', 'd'],
-8305186196625446821: 'llamaindex is an awesome library!'}
object_index._object_node_mapping.obj_node_mapping
{7981070310142320670: {'input': "Hey, how's it going"},
-5984737625581842527: ['a', 'b', 'c', 'd'],
-8305186196625446821: 'llamaindex is an awesome library!'}
from llama_index.core.tools import FunctionTool
from llama_index.core import SummaryIndex
from llama_index.core.objects import SimpleToolNodeMapping
def add(a: int, b: int) -> int:
    """Add two integers and return the result integer."""
    return a + b
def multiply(a: int, b: int) -> int:
    """Multiply two integers and return the result integer."""
    return a * b
# Wrap the plain Python functions as FunctionTool objects.
multiply_tool = FunctionTool.from_defaults(fn=multiply)
add_tool = FunctionTool.from_defaults(fn=add)
# Build a tool-specific node mapping and hand it to from_objects as the
# second positional argument, alongside the same list of tools.
object_mapping = SimpleToolNodeMapping.from_objects([add_tool, multiply_tool])
object_index = ObjectIndex.from_objects(
[add_tool, multiply_tool], object_mapping
)
# trying to persist the object_mapping directly will raise an error
object_mapping.persist()
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
Cell In[4], line 2
1 # trying to persist the object_mapping directly will raise an error
----> 2 object_mapping.persist()
File ~/Projects/llama_index/llama_index/objects/tool_node_mapping.py:47, in BaseToolNodeMapping.persist(self, persist_dir, obj_node_mapping_fname)
43 def persist(
44 self, persist_dir: str = ..., obj_node_mapping_fname: str = ...
45 ) -> None:
46 """Persist objs."""
---> 47 raise NotImplementedError("Subclasses should implement this!")
NotImplementedError: Subclasses should implement this!
# trying to persist the object index here emits a UserWarning to the user
object_index.persist()
/var/folders/0g/wd11bmkd791fz7hvgy1kqyp00000gn/T/ipykernel_77363/46708458.py:2: UserWarning: Unable to persist ObjectNodeMapping. You will need to reconstruct the same object node mapping to build this ObjectIndex
object_index.persist()

在这种情况下,只有索引被持久化。 为了重新构建如上所述的 ObjectIndex,我们需要手动重新构建 ObjectNodeMapping 并将其提供给 ObjectIndex.from_persist_dir 方法。

reloaded_object_index = ObjectIndex.from_persist_dir(
object_node_mapping=object_mapping # without this, an error will be thrown
)