基类:BasePydanticReader
Docling 阅读器。
将PDF、DOCX及其他文档格式提取为LlamaIndex文档,支持Markdown格式或JSON序列化的Docling原生格式。
参数:
| 名称 |
类型 |
描述 |
默认 |
export_type
|
Literal[markdown, json]
|
要导出的类型。默认为 "markdown"。
|
required
|
doc_converter
|
DocumentConverter
|
要使用的Docling转换器。默认工厂:DocumentConverter。
|
required
|
md_export_kwargs
|
Dict[str, Any]
|
用于Markdown导出时的参数。默认为 {"image_placeholder": ""}。
|
required
|
id_func
|
DocIDGenCallable
|
要使用的文档ID生成函数。默认值:_uuid4_doc_id_gen
|
required
|
源代码位于 llama_index/readers/docling/base.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
class DoclingReader(BasePydanticReader):
    """
    Docling Reader.

    Extracts PDF, DOCX, and other document formats into LlamaIndex Documents as
    either Markdown or JSON-serialized Docling native format.

    Args:
        export_type (Literal["markdown", "json"], optional): The type to export to.
            Defaults to "markdown".
        doc_converter (DocumentConverter, optional): The Docling converter to use.
            Default factory: `DocumentConverter`.
        md_export_kwargs (Dict[str, Any], optional): Kwargs to use in case of
            markdown export. Defaults to `{"image_placeholder": ""}`.
        id_func (DocIDGenCallable, optional): Doc ID generation function to use.
            Default: `_uuid4_doc_id_gen`.
    """

    class ExportType(str, Enum):
        """Export formats supported by the reader."""

        MARKDOWN = "markdown"
        JSON = "json"

    @runtime_checkable
    class DocIDGenCallable(Protocol):
        """Callable that derives a document ID from a Docling document and its source."""

        def __call__(self, doc: DLDocument, file_path: str | Path) -> str: ...

    @staticmethod
    def _uuid4_doc_id_gen(doc: DLDocument, file_path: str | Path) -> str:
        """Return a fresh random UUID4 string; both arguments are ignored."""
        return str(uuid.uuid4())

    # Format used for the text of each produced document.
    export_type: ExportType = ExportType.MARKDOWN
    # Converter whose `convert(source).document` result is exported.
    doc_converter: DocumentConverter = Field(default_factory=DocumentConverter)
    # Keyword arguments forwarded to `export_to_markdown` for Markdown export.
    md_export_kwargs: Dict[str, Any] = {"image_placeholder": ""}
    # Called as `id_func(doc=..., file_path=...)` to produce each document's ID.
    id_func: DocIDGenCallable = _uuid4_doc_id_gen

    def lazy_load_data(
        self,
        file_path: str | Path | Iterable[str] | Iterable[Path],
        extra_info: dict | None = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> Iterable[LIDocument]:
        """
        Lazily load from given source.

        Args:
            file_path (str | Path | Iterable[str] | Iterable[Path]): Document file
                source as single str (URL or local file) or pathlib.Path — or
                iterable thereof.
            extra_info (dict | None, optional): Any pre-existing metadata to
                include. Each produced document receives its own shallow copy.
                Defaults to None.
            fs (Optional[AbstractFileSystem], optional): Accepted as part of the
                signature but not used by this implementation. Defaults to None.

        Returns:
            Iterable[LIDocument]: Iterable over the created LlamaIndex documents.

        Raises:
            ValueError: If `export_type` is neither Markdown nor JSON.
        """
        # A lone str is itself iterable, so it is explicitly treated as a single
        # source rather than being iterated character by character.
        file_paths = (
            file_path
            if isinstance(file_path, Iterable) and not isinstance(file_path, str)
            else [file_path]
        )

        for source in file_paths:
            dl_doc = self.doc_converter.convert(source).document

            text: str
            if self.export_type == self.ExportType.MARKDOWN:
                text = dl_doc.export_to_markdown(**self.md_export_kwargs)
            elif self.export_type == self.ExportType.JSON:
                text = json.dumps(dl_doc.export_to_dict())
            else:
                raise ValueError(f"Unexpected export type: {self.export_type}")

            li_doc = LIDocument(
                doc_id=self.id_func(doc=dl_doc, file_path=source),
                text=text,
            )
            # Copy the metadata per document: assigning `extra_info` directly would
            # make every yielded document (and the caller's dict) share one object,
            # so mutating one document's metadata would silently change the others.
            li_doc.metadata = dict(extra_info) if extra_info else {}
            yield li_doc
|
lazy_load_data
lazy_load_data(file_path: str | Path | Iterable[str] | Iterable[Path], extra_info: dict | None = None, fs: Optional[AbstractFileSystem] = None) -> Iterable[LIDocument]
从给定源延迟加载。
参数:
| 名称 |
类型 |
描述 |
默认 |
file_path
|
str | Path | Iterable[str] | Iterable[Path]
|
文档文件源作为单个字符串(URL或本地文件)或 pathlib.Path — 或其可迭代对象
|
required
|
extra_info
|
dict | None
|
要包含的任何预先存在的元数据。默认为 None。
|
None
|
返回:
| 类型 |
描述 |
Iterable[LIDocument]
|
所创建的LlamaIndex文档的可迭代对象。
|
源代码位于 llama_index/readers/docling/base.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def lazy_load_data(
    self,
    file_path: str | Path | Iterable[str] | Iterable[Path],
    extra_info: dict | None = None,
    fs: Optional[AbstractFileSystem] = None,
) -> Iterable[LIDocument]:
    """
    Lazily load from given source.

    Args:
        file_path (str | Path | Iterable[str] | Iterable[Path]): Document file
            source as single str (URL or local file) or pathlib.Path — or
            iterable thereof.
        extra_info (dict | None, optional): Any pre-existing metadata to
            include. Each produced document receives its own shallow copy.
            Defaults to None.
        fs (Optional[AbstractFileSystem], optional): Accepted as part of the
            signature but not used by this implementation. Defaults to None.

    Returns:
        Iterable[LIDocument]: Iterable over the created LlamaIndex documents.

    Raises:
        ValueError: If `export_type` is neither Markdown nor JSON.
    """
    # A lone str is itself iterable, so it is explicitly treated as a single
    # source rather than being iterated character by character.
    file_paths = (
        file_path
        if isinstance(file_path, Iterable) and not isinstance(file_path, str)
        else [file_path]
    )

    for source in file_paths:
        dl_doc = self.doc_converter.convert(source).document

        text: str
        if self.export_type == self.ExportType.MARKDOWN:
            text = dl_doc.export_to_markdown(**self.md_export_kwargs)
        elif self.export_type == self.ExportType.JSON:
            text = json.dumps(dl_doc.export_to_dict())
        else:
            raise ValueError(f"Unexpected export type: {self.export_type}")

        li_doc = LIDocument(
            doc_id=self.id_func(doc=dl_doc, file_path=source),
            text=text,
        )
        # Copy the metadata per document: assigning `extra_info` directly would
        # make every yielded document (and the caller's dict) share one object,
        # so mutating one document's metadata would silently change the others.
        li_doc.metadata = dict(extra_info) if extra_info else {}
        yield li_doc
|