跳转到内容

索引

基础读取器类。

BaseReader #

基类:ABC

用于从目录加载数据的实用工具。

源代码位于 llama_index/core/readers/base.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
class BaseReader(ABC):  # pragma: no cover
    """
    Utilities for loading data from a directory.

    The base ``lazy_load_data`` raises ``NotImplementedError``; every other
    method in this class is an adapter that ultimately delegates to it
    (``load_data`` materializes it, the ``a*`` variants run the sync method
    in a worker thread).
    """

    def lazy_load_data(self, *args: Any, **load_kwargs: Any) -> Iterable[Document]:
        """
        Load data from the input directory lazily.

        Raises:
            NotImplementedError: Always, in this base implementation.

        """
        raise NotImplementedError(
            f"{self.__class__.__name__} does not provide lazy_load_data method currently"
        )

    async def alazy_load_data(
        self, *args: Any, **load_kwargs: Any
    ) -> Iterable[Document]:
        """
        Load data from the input directory lazily (async wrapper).

        Returns:
            Iterable[Document]: Whatever ``lazy_load_data`` returns.

        """
        # Threaded async - just calls the sync method with to_thread. Override in subclasses for real async implementations.
        # NOTE: only the *call* runs in the worker thread. If lazy_load_data
        # returns a lazy iterable (e.g. a generator), the items are produced
        # wherever the caller iterates over the result.
        return await asyncio.to_thread(self.lazy_load_data, *args, **load_kwargs)

    def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]:
        """
        Load data from the input directory.

        Returns:
            List[Document]: The fully materialized output of ``lazy_load_data``.

        """
        return list(self.lazy_load_data(*args, **load_kwargs))

    async def aload_data(self, *args: Any, **load_kwargs: Any) -> List[Document]:
        """
        Load data from the input directory (async wrapper).

        Runs ``load_data`` in a worker thread via ``asyncio.to_thread``.
        """
        return await asyncio.to_thread(self.load_data, *args, **load_kwargs)

    def load_langchain_documents(
        self, *args: Any, **load_kwargs: Any
    ) -> List["LCDocument"]:
        """
        Load data in LangChain document format.

        Args:
            *args: Positional arguments forwarded to ``load_data``.
            **load_kwargs: Keyword arguments forwarded to ``load_data``.

        Returns:
            List[LCDocument]: ``to_langchain_format()`` of every loaded document.

        """
        # Forward positional arguments too, so this accepts exactly what
        # load_data accepts (previously only keyword arguments got through).
        docs = self.load_data(*args, **load_kwargs)
        return [d.to_langchain_format() for d in docs]

lazy_load_data #

lazy_load_data(*args: Any, **load_kwargs: Any) -> Iterable[Document]

从输入目录中惰性加载数据。

源代码位于 llama_index/core/readers/base.py
22
23
24
25
26
def lazy_load_data(self, *args: Any, **load_kwargs: Any) -> Iterable[Document]:
    """
    Lazily load data; not implemented in this base version.

    Raises:
        NotImplementedError: Always; the message names the concrete class.

    """
    reader_name = self.__class__.__name__
    message = f"{reader_name} does not provide lazy_load_data method currently"
    raise NotImplementedError(message)

alazy_load_data async #

alazy_load_data(*args: Any, **load_kwargs: Any) -> Iterable[Document]

从输入目录中惰性加载数据。

源代码位于 llama_index/core/readers/base.py
28
29
30
31
32
33
async def alazy_load_data(
    self, *args: Any, **load_kwargs: Any
) -> Iterable[Document]:
    """
    Async wrapper around ``lazy_load_data``.

    Returns:
        Iterable[Document]: Whatever ``lazy_load_data`` returns.

    """
    # Threaded fallback: the sync method is invoked in a worker thread.
    # Subclasses can override this with a genuinely asynchronous version.
    sync_loader = self.lazy_load_data
    result = await asyncio.to_thread(sync_loader, *args, **load_kwargs)
    return result

load_data #

load_data(*args: Any, **load_kwargs: Any) -> List[Document]

从输入目录加载数据。

源代码位于 llama_index/core/readers/base.py
35
36
37
def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]:
    """
    Eagerly load data by draining ``lazy_load_data`` into a list.

    Returns:
        List[Document]: Every item yielded by ``lazy_load_data``.

    """
    lazy_documents = self.lazy_load_data(*args, **load_kwargs)
    return [*lazy_documents]

aload_data async #

aload_data(*args: Any, **load_kwargs: Any) -> List[Document]

从输入目录加载数据。

源代码位于 llama_index/core/readers/base.py
39
40
41
async def aload_data(self, *args: Any, **load_kwargs: Any) -> List[Document]:
    """
    Async wrapper around ``load_data``.

    Runs the synchronous ``load_data`` in a worker thread and returns its
    result.
    """
    sync_loader = self.load_data
    documents = await asyncio.to_thread(sync_loader, *args, **load_kwargs)
    return documents

load_langchain_documents #

load_langchain_documents(**load_kwargs: Any) -> List[LCDocument]

以LangChain文档格式加载数据。

源代码位于 llama_index/core/readers/base.py
43
44
45
46
def load_langchain_documents(
    self, *args: Any, **load_kwargs: Any
) -> List["LCDocument"]:
    """
    Load data in LangChain document format.

    Args:
        *args: Positional arguments forwarded to ``load_data``.
        **load_kwargs: Keyword arguments forwarded to ``load_data``.

    Returns:
        List[LCDocument]: ``to_langchain_format()`` of every loaded document.

    """
    # Forward positional arguments as well, so this method accepts the same
    # inputs as load_data (previously only keyword arguments were passed on).
    docs = self.load_data(*args, **load_kwargs)
    return [d.to_langchain_format() for d in docs]

BasePydanticReader #

基类:BaseReader, BaseComponent

使用Pydantic的可序列化数据加载器。

参数:

名称 类型 描述 默认
is_remote bool

数据是从远程API加载还是从本地文件加载。

False
源代码位于 llama_index/core/readers/base.py
49
50
51
52
53
54
55
56
class BasePydanticReader(BaseReader, BaseComponent):
    """Serializable Data Loader with Pydantic."""

    # Combines the BaseReader loading interface with BaseComponent.
    # arbitrary_types_allowed lets subclasses declare fields whose types
    # pydantic cannot validate natively.
    model_config = ConfigDict(arbitrary_types_allowed=True)
    # Flag describing where the data comes from: True for a remote API,
    # False (the default) for a local file.
    is_remote: bool = Field(
        default=False,
        description="Whether the data is loaded from a remote API or a local file.",
    )

ResourcesReaderMixin #

基类:ABC

为提供访问不同类型资源的读取器设计的混入类。

资源指的是可以被读取器访问的特定数据实体。 资源示例包括文件系统读取器的文件、Slack读取器的频道ID,或Notion读取器的页面。

源代码位于 llama_index/core/readers/base.py
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
class ResourcesReaderMixin(ABC):  # pragma: no cover
    """
    Mixin for readers that expose individually addressable resources.

    A resource is a specific data entity the reader can access, for example
    a file for a filesystem reader, a channel ID for a Slack reader, or a
    page for a Notion reader.

    Subclasses must implement ``list_resources``, ``get_resource_info`` and
    ``load_resource``. The async variants defined here run their synchronous
    counterparts in a worker thread via ``asyncio.to_thread``.
    """

    @abstractmethod
    def list_resources(self, *args: Any, **kwargs: Any) -> List[str]:
        """
        Return the identifiers of the resources available in the reader.

        Returns:
            List[str]: Identifiers for the specific type of resources
            available in the reader.

        """

    async def alist_resources(self, *args: Any, **kwargs: Any) -> List[str]:
        """
        Asynchronously return the identifiers of the available resources.

        Returns:
            List[str]: A list of resources based on the reader type, such as
            files for a filesystem reader, channel IDs for a Slack reader, or
            pages for a Notion reader.

        """
        sync_lister = self.list_resources
        return await asyncio.to_thread(sync_lister, *args, **kwargs)

    def get_permission_info(self, resource_id: str, *args: Any, **kwargs: Any) -> Dict:
        """
        Get a dictionary of information about the permissions of a resource.

        Args:
            resource_id (str): The resource identifier.

        Raises:
            NotImplementedError: Always, in this base implementation.

        """
        owner = self.__class__.__name__
        raise NotImplementedError(
            f"{owner} does not provide get_permission_info method currently"
        )

    async def aget_permission_info(
        self, resource_id: str, *args: Any, **kwargs: Any
    ) -> Dict:
        """
        Asynchronously get permission information for a specific resource.

        Args:
            resource_id (str): The resource identifier.

        Returns:
            Dict: Whatever ``get_permission_info`` returns.

        """
        sync_getter = self.get_permission_info
        return await asyncio.to_thread(sync_getter, resource_id, *args, **kwargs)

    @abstractmethod
    def get_resource_info(self, resource_id: str, *args: Any, **kwargs: Any) -> Dict:
        """
        Get a dictionary of information about a specific resource.

        Args:
            resource_id (str): The resource identifier.

        Returns:
            Dict: A dictionary of information about the resource.

        """

    async def aget_resource_info(
        self, resource_id: str, *args: Any, **kwargs: Any
    ) -> Dict:
        """
        Asynchronously get information about a specific resource.

        Args:
            resource_id (str): The resource identifier.

        Returns:
            Dict: A dictionary of information about the resource.

        """
        sync_getter = self.get_resource_info
        return await asyncio.to_thread(sync_getter, resource_id, *args, **kwargs)

    def list_resources_with_info(self, *args: Any, **kwargs: Any) -> Dict[str, Dict]:
        """
        Get a dictionary of information about all resources.

        The same ``*args``/``**kwargs`` are passed to both ``list_resources``
        and every ``get_resource_info`` call.

        Returns:
            Dict[str, Dict]: Resource identifier mapped to its info dictionary.

        """
        info_by_resource: Dict[str, Dict] = {}
        for resource in self.list_resources(*args, **kwargs):
            info_by_resource[resource] = self.get_resource_info(
                resource, *args, **kwargs
            )
        return info_by_resource

    async def alist_resources_with_info(
        self, *args: Any, **kwargs: Any
    ) -> Dict[str, Dict]:
        """
        Asynchronously get a dictionary of information about all resources.

        Resources are processed one after another, in listing order.

        Returns:
            Dict[str, Dict]: Resource identifier mapped to its info dictionary.

        """
        info_by_resource: Dict[str, Dict] = {}
        resources = await self.alist_resources(*args, **kwargs)
        for resource in resources:
            info_by_resource[resource] = await self.aget_resource_info(
                resource, *args, **kwargs
            )
        return info_by_resource

    @abstractmethod
    def load_resource(
        self, resource_id: str, *args: Any, **kwargs: Any
    ) -> List[Document]:
        """
        Load data from a specific resource.

        Args:
            resource_id (str): The resource identifier.

        Returns:
            List[Document]: A list of documents loaded from the resource.

        """

    async def aload_resource(
        self, resource_id: str, *args: Any, **kwargs: Any
    ) -> List[Document]:
        """
        Asynchronously load data from a specific resource.

        Args:
            resource_id (str): The resource identifier.

        Returns:
            List[Document]: Whatever ``load_resource`` returns.

        """
        sync_loader = self.load_resource
        return await asyncio.to_thread(sync_loader, resource_id, *args, **kwargs)

    def load_resources(
        self, resource_ids: List[str], *args: Any, **kwargs: Any
    ) -> List[Document]:
        """
        Similar to load_data, but only for specific resources.

        Args:
            resource_ids (List[str]): List of resource identifiers.

        Returns:
            List[Document]: Documents of all requested resources, concatenated
            in the order of ``resource_ids``.

        """
        documents: List[Document] = []
        for resource in resource_ids:
            documents.extend(self.load_resource(resource, *args, **kwargs))
        return documents

    async def aload_resources(
        self, resource_ids: List[str], *args: Any, **kwargs: Any
    ) -> Dict[str, List[Document]]:
        """
        Similar to load_data, but only for specific resources (async).

        Unlike ``load_resources``, the result is keyed by resource identifier
        rather than flattened into a single list.

        Args:
            resource_ids (List[str]): List of resource identifiers.

        Returns:
            Dict[str, List[Document]]: A dictionary of documents loaded from the resources.

        """
        documents_by_resource: Dict[str, List[Document]] = {}
        for resource in resource_ids:
            documents_by_resource[resource] = await self.aload_resource(
                resource, *args, **kwargs
            )
        return documents_by_resource

list_resources abstractmethod #

list_resources(*args: Any, **kwargs: Any) -> List[str]

读取器中可用特定类型资源的标识符列表。

返回:

类型 描述
List[str]

List[str]: 读取器中可用特定类型资源的标识符列表。

源代码位于 llama_index/core/readers/base.py
67
68
69
70
71
72
73
74
75
@abstractmethod
def list_resources(self, *args: Any, **kwargs: Any) -> List[str]:
    """
    List the identifiers of the resources available in the reader.

    Abstract: the body is empty and concrete readers supply the
    implementation.

    Returns:
        List[str]: List of identifiers for the specific type of resources available in the reader.

    """

alist_resources async #

alist_resources(*args: Any, **kwargs: Any) -> List[str]

异步获取读取器中可用的特定类型资源的标识符列表。

返回:

类型 描述
List[str]

List[str]: 根据读取器类型返回的资源列表,例如文件系统读取器的文件、Slack 读取器的频道 ID,或 Notion 读取器的页面。

源代码位于 llama_index/core/readers/base.py
77
78
79
80
81
82
83
84
85
86
async def alist_resources(self, *args: Any, **kwargs: Any) -> List[str]:
    """
    Asynchronously list the identifiers of the available resources.

    Runs the synchronous ``list_resources`` in a worker thread.

    Returns:
        List[str]: A list of resources based on the reader type, such as
        files for a filesystem reader, channel IDs for a Slack reader, or
        pages for a Notion reader.

    """
    sync_lister = self.list_resources
    identifiers = await asyncio.to_thread(sync_lister, *args, **kwargs)
    return identifiers

get_permission_info #

get_permission_info(resource_id: str, *args: Any, **kwargs: Any) -> Dict

获取关于特定资源权限信息的字典。

源代码位于 llama_index/core/readers/base.py
88
89
90
91
92
93
94
def get_permission_info(self, resource_id: str, *args: Any, **kwargs: Any) -> Dict:
    """
    Get a dictionary of information about the permissions of a resource.

    Args:
        resource_id (str): The resource identifier.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    owner = self.__class__.__name__
    message = f"{owner} does not provide get_permission_info method currently"
    raise NotImplementedError(message)

aget_permission_info async #

aget_permission_info(resource_id: str, *args: Any, **kwargs: Any) -> Dict

异步获取特定资源权限信息的字典。

源代码位于 llama_index/core/readers/base.py
 96
 97
 98
 99
100
101
102
103
104
async def aget_permission_info(
    self, resource_id: str, *args: Any, **kwargs: Any
) -> Dict:
    """
    Asynchronously get permission information for a specific resource.

    Runs the synchronous ``get_permission_info`` in a worker thread.

    Args:
        resource_id (str): The resource identifier.

    Returns:
        Dict: Whatever ``get_permission_info`` returns.

    """
    sync_getter = self.get_permission_info
    permissions = await asyncio.to_thread(sync_getter, resource_id, *args, **kwargs)
    return permissions

get_resource_info abstractmethod #

get_resource_info(resource_id: str, *args: Any, **kwargs: Any) -> Dict

获取关于特定资源的信息字典。

参数:

名称 类型 描述 默认
resource_id str

资源标识符。

required

返回:

名称 类型 描述
Dict Dict

关于该资源的信息字典。

源代码位于 llama_index/core/readers/base.py
106
107
108
109
110
111
112
113
114
115
116
117
@abstractmethod
def get_resource_info(self, resource_id: str, *args: Any, **kwargs: Any) -> Dict:
    """
    Get a dictionary of information about a specific resource.

    Abstract: the body is empty and concrete readers supply the
    implementation.

    Args:
        resource_id (str): The resource identifier.

    Returns:
        Dict: A dictionary of information about the resource.

    """

aget_resource_info async #

aget_resource_info(resource_id: str, *args: Any, **kwargs: Any) -> Dict

异步获取特定资源的信息字典。

参数:

名称 类型 描述 默认
resource_id str

资源标识符。

required

返回:

名称 类型 描述
Dict Dict

关于该资源的信息字典。

源代码位于 llama_index/core/readers/base.py
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
async def aget_resource_info(
    self, resource_id: str, *args: Any, **kwargs: Any
) -> Dict:
    """
    Asynchronously get information about a specific resource.

    Runs the synchronous ``get_resource_info`` in a worker thread.

    Args:
        resource_id (str): The resource identifier.

    Returns:
        Dict: A dictionary of information about the resource.

    """
    sync_getter = self.get_resource_info
    info = await asyncio.to_thread(sync_getter, resource_id, *args, **kwargs)
    return info

list_resources_with_info #

list_resources_with_info(*args: Any, **kwargs: Any) -> Dict[str, Dict]

获取所有资源信息的字典。

返回:

类型 描述
Dict[str, Dict]

Dict[str, Dict]: 包含所有资源信息的字典。

源代码位于 llama_index/core/readers/base.py
136
137
138
139
140
141
142
143
144
145
146
147
def list_resources_with_info(self, *args: Any, **kwargs: Any) -> Dict[str, Dict]:
    """
    Get a dictionary of information about all resources.

    The same ``*args``/``**kwargs`` are passed to ``list_resources`` and to
    every ``get_resource_info`` call.

    Returns:
        Dict[str, Dict]: Resource identifier mapped to its info dictionary.

    """
    info_by_resource: Dict[str, Dict] = {}
    for resource in self.list_resources(*args, **kwargs):
        info_by_resource[resource] = self.get_resource_info(resource, *args, **kwargs)
    return info_by_resource

alist_resources_with_info async #

alist_resources_with_info(*args: Any, **kwargs: Any) -> Dict[str, Dict]

异步获取所有资源信息的字典。

返回:

类型 描述
Dict[str, Dict]

Dict[str, Dict]: 包含所有资源信息的字典。

源代码位于 llama_index/core/readers/base.py
149
150
151
152
153
154
155
156
157
158
159
160
161
162
async def alist_resources_with_info(
    self, *args: Any, **kwargs: Any
) -> Dict[str, Dict]:
    """
    Asynchronously get a dictionary of information about all resources.

    Resources are listed first, then their info is fetched one after another
    in listing order.

    Returns:
        Dict[str, Dict]: Resource identifier mapped to its info dictionary.

    """
    info_by_resource: Dict[str, Dict] = {}
    resources = await self.alist_resources(*args, **kwargs)
    for resource in resources:
        info_by_resource[resource] = await self.aget_resource_info(
            resource, *args, **kwargs
        )
    return info_by_resource

load_resource abstractmethod #

load_resource(resource_id: str, *args: Any, **kwargs: Any) -> List[Document]

从特定资源加载数据。

参数:

名称 类型 描述 默认
resource_id str

资源标识符。

required

返回:

类型 描述
List[Document]

List[Document]: 从资源加载的文档列表。

源代码位于 llama_index/core/readers/base.py
164
165
166
167
168
169
170
171
172
173
174
175
176
177
@abstractmethod
def load_resource(
    self, resource_id: str, *args: Any, **kwargs: Any
) -> List[Document]:
    """
    Load data from a specific resource.

    Abstract: the body is empty and concrete readers supply the
    implementation.

    Args:
        resource_id (str): The resource identifier.

    Returns:
        List[Document]: A list of documents loaded from the resource.

    """

aload_resource async #

aload_resource(resource_id: str, *args: Any, **kwargs: Any) -> List[Document]

从文件系统中读取文件并异步返回文档。

源代码位于 llama_index/core/readers/base.py
179
180
181
182
183
async def aload_resource(
    self, resource_id: str, *args: Any, **kwargs: Any
) -> List[Document]:
    """
    Asynchronously load the documents of a single resource.

    Runs the synchronous ``load_resource`` in a worker thread.

    Args:
        resource_id (str): The resource identifier.

    Returns:
        List[Document]: Whatever ``load_resource`` returns.

    """
    sync_loader = self.load_resource
    documents = await asyncio.to_thread(sync_loader, resource_id, *args, **kwargs)
    return documents

load_resources #

load_resources(resource_ids: List[str], *args: Any, **kwargs: Any) -> List[Document]

类似于 load_data,但仅针对特定资源。

参数:

名称 类型 描述 默认
resource_ids List[str]

资源标识符列表。

required

返回:

类型 描述
List[Document]

List[Document]: 从资源加载的文档列表。

源代码位于 llama_index/core/readers/base.py
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
def load_resources(
    self, resource_ids: List[str], *args: Any, **kwargs: Any
) -> List[Document]:
    """
    Similar to load_data, but only for specific resources.

    Args:
        resource_ids (List[str]): List of resource identifiers.

    Returns:
        List[Document]: Documents of all requested resources, concatenated in
        the order of ``resource_ids``.

    """
    documents: List[Document] = []
    for resource in resource_ids:
        documents.extend(self.load_resource(resource, *args, **kwargs))
    return documents

aload_resources async #

aload_resources(resource_ids: List[str], *args: Any, **kwargs: Any) -> Dict[str, List[Document]]

类似于 load_data,但仅针对特定资源。

参数:

名称 类型 描述 默认
resource_ids List[str]

资源标识符列表。

required

返回:

类型 描述
Dict[str, List[Document]]

Dict[str, List[Document]]: 从资源加载的文档字典。

源代码位于 llama_index/core/readers/base.py
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
async def aload_resources(
    self, resource_ids: List[str], *args: Any, **kwargs: Any
) -> Dict[str, List[Document]]:
    """
    Similar to load_data, but only for specific resources (async).

    Resources are loaded one after another through ``aload_resource``; the
    result is keyed by resource identifier.

    Args:
        resource_ids (List[str]): List of resource identifiers.

    Returns:
        Dict[str, List[Document]]: A dictionary of documents loaded from the resources.

    """
    documents_by_resource: Dict[str, List[Document]] = {}
    for resource in resource_ids:
        documents_by_resource[resource] = await self.aload_resource(
            resource, *args, **kwargs
        )
    return documents_by_resource

ReaderConfig #

基类:BaseComponent

表示一个读取器及其输入参数。

参数:

名称 类型 描述 默认
reader BasePydanticReader

要使用的读取器。

required
reader_args List[Any]

读取器参数。

<dynamic>
源代码位于 llama_index/core/readers/base.py
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
class ReaderConfig(BaseComponent):  # pragma: no cover
    """Represents a reader and its input arguments."""

    model_config = ConfigDict(arbitrary_types_allowed=True)
    # The reader to invoke, plus the positional and keyword arguments that
    # read() passes to its load_data method.
    reader: BasePydanticReader = Field(..., description="Reader to use.")
    reader_args: List[Any] = Field(default_factory=list, description="Reader args.")
    reader_kwargs: Dict[str, Any] = Field(
        default_factory=dict, description="Reader kwargs."
    )

    @classmethod
    def class_name(cls) -> str:
        """Return the fixed name identifier of this class."""
        identifier = "ReaderConfig"
        return identifier

    def to_dict(self, **kwargs: Any) -> Dict[str, Any]:
        """
        Convert the config to a plain dictionary.

        Args:
            **kwargs: Forwarded to the wrapped reader's ``to_dict``.

        Returns:
            Dict[str, Any]: The serialized reader under the ``"loader"`` key,
            followed by the reader args, reader kwargs and the class name.

        """
        # NOTE: the reader is stored under "loader", not "reader".
        serialized: Dict[str, Any] = {"loader": self.reader.to_dict(**kwargs)}
        serialized["reader_args"] = self.reader_args
        serialized["reader_kwargs"] = self.reader_kwargs
        serialized["class_name"] = self.class_name()
        return serialized

    def read(self) -> List[Document]:
        """Call the reader's ``load_data`` with the stored arguments."""
        load = self.reader.load_data
        return load(*self.reader_args, **self.reader_kwargs)

class_name classmethod #

class_name() -> str

获取类的名称标识符。

源代码位于 llama_index/core/readers/base.py
233
234
235
236
@classmethod
def class_name(cls) -> str:
    """Return the fixed name identifier of this class."""
    identifier = "ReaderConfig"
    return identifier

to_dict #

to_dict(**kwargs: Any) -> Dict[str, Any]

将类转换为字典。

源代码位于 llama_index/core/readers/base.py
238
239
240
241
242
243
244
245
def to_dict(self, **kwargs: Any) -> Dict[str, Any]:
    """
    Convert the config to a plain dictionary.

    Args:
        **kwargs: Forwarded to the wrapped reader's ``to_dict``.

    Returns:
        Dict[str, Any]: The serialized reader under the ``"loader"`` key,
        followed by the reader args, reader kwargs and the class name.

    """
    # NOTE: the reader is stored under "loader", not "reader".
    serialized: Dict[str, Any] = {"loader": self.reader.to_dict(**kwargs)}
    serialized["reader_args"] = self.reader_args
    serialized["reader_kwargs"] = self.reader_kwargs
    serialized["class_name"] = self.class_name()
    return serialized

read #

read() -> List[Document]

使用给定的参数调用加载器。

源代码位于 llama_index/core/readers/base.py
247
248
249
def read(self) -> List[Document]:
    """
    Call the reader's ``load_data`` with the stored arguments.

    Returns:
        List[Document]: Whatever ``self.reader.load_data`` returns for
        ``reader_args`` / ``reader_kwargs``.

    """
    load = self.reader.load_data
    return load(*self.reader_args, **self.reader_kwargs)

选项: 成员:- BaseReader - BasePydanticReader