基类: BaseReader
简单的Gitbook阅读器。
将每个gitbook页面转换为LlamaIndex使用的文档。
参数:
| 名称 |
类型 |
描述 |
默认值 |
api_token
|
str
|
Gitbook API 令牌。
|
required
|
api_url
|
str
|
Gitbook API 端点。
|
None
|
Source code in llama-index-integrations/readers/llama-index-readers-gitbook/llama_index/readers/gitbook/base.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
class SimpleGitbookReader(BaseReader):
    """
    Simple gitbook reader.

    Convert each gitbook page into Document used by LlamaIndex.

    Args:
        api_token (str): Gitbook API Token.
        api_url (Optional[str]): Gitbook API Endpoint. Forwarded unchanged to
            GitbookClient. Defaults to None.
    """

    def __init__(self, api_token: str, api_url: Optional[str] = None) -> None:
        """Initialize with parameters."""
        self.client = GitbookClient(api_token, api_url)

    def load_data(
        self,
        space_id: str,
        metadata_names: Optional[List[str]] = None,
        show_progress: bool = False,
    ) -> List[Document]:
        """
        Load the pages of a Gitbook space as Documents.

        Args:
            space_id (str): Gitbook space id
            metadata_names (Optional[List[str]]): names of the fields to be added
                to the metadata attribute of the Document.
                only 'path', 'title', 'description', 'parent' are available
                Defaults to None, in which case only 'path' is stored.
                An empty list produces Documents with empty metadata.
            show_progress (bool, optional): Show progress bar. Defaults to False

        Returns:
            List[Document]: A list of documents, one per page that has
                content. Pages without content are skipped with a printed
                warning.

        Raises:
            ValueError: If metadata_names contains a name that is not in
                VALID_METADATA_FIELDS.

        """
        # Reject unknown field names up front, before any page is fetched.
        if metadata_names:
            invalid_fields = set(metadata_names) - VALID_METADATA_FIELDS
            if invalid_fields:
                # Sorted so the error message is deterministic.
                raise ValueError(
                    f"Invalid metadata fields: {', '.join(sorted(invalid_fields))}"
                )

        pages = self.client.list_pages(space_id)

        if show_progress:
            # Imported lazily so tqdm is only needed when a progress bar is requested.
            from tqdm import tqdm

            iterator = tqdm(pages, desc="Downloading pages")
        else:
            iterator = pages

        documents = []
        for page in iterator:
            page_id = page.get("id")
            content = self.client.get_page_markdown(space_id, page_id)
            if not content:
                print(f"Warning: No content found for page ID {page_id}. Skipping...")
                continue

            if metadata_names is None:
                metadata = {"path": page.get("path")}
            else:
                # .get() yields None for absent keys instead of raising, so no
                # KeyError handling is needed; names were validated above.
                metadata = {name: page.get(name) for name in metadata_names}

            documents.append(Document(text=content, id_=page_id, metadata=metadata))

        return documents
|
加载数据
load_data(space_id: str, metadata_names: Optional[List[str]] = None, show_progress=False) -> List[Document]
从指定的 Gitbook 空间加载页面数据，并将每个页面转换为文档。
参数:
| 名称 |
类型 |
描述 |
默认值 |
space_id
|
str
|
Gitbook 空间 ID。
|
required
|
metadata_names
|
Optional[List[str]]
|
要添加到文档元数据属性中的字段名称。
仅支持 'path'、'title'、'description'、'parent' 这几个字段
默认为 None
|
None
|
show_progress
|
bool
|
是否显示进度条。默认为 False。
|
False
|
返回:
List[Document]: 文档列表。
Source code in llama-index-integrations/readers/llama-index-readers-gitbook/llama_index/readers/gitbook/base.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def load_data(
    self,
    space_id: str,
    metadata_names: Optional[List[str]] = None,
    show_progress: bool = False,
) -> List[Document]:
    """
    Load the pages of a Gitbook space as Documents.

    Args:
        space_id (str): Gitbook space id
        metadata_names (Optional[List[str]]): names of the fields to be added
            to the metadata attribute of the Document.
            only 'path', 'title', 'description', 'parent' are available
            Defaults to None, in which case only 'path' is stored.
            An empty list produces Documents with empty metadata.
        show_progress (bool, optional): Show progress bar. Defaults to False

    Returns:
        List[Document]: A list of documents, one per page that has
            content. Pages without content are skipped with a printed
            warning.

    Raises:
        ValueError: If metadata_names contains a name that is not in
            VALID_METADATA_FIELDS.

    """
    # Reject unknown field names up front, before any page is fetched.
    if metadata_names:
        invalid_fields = set(metadata_names) - VALID_METADATA_FIELDS
        if invalid_fields:
            # Sorted so the error message is deterministic.
            raise ValueError(
                f"Invalid metadata fields: {', '.join(sorted(invalid_fields))}"
            )

    pages = self.client.list_pages(space_id)

    if show_progress:
        # Imported lazily so tqdm is only needed when a progress bar is requested.
        from tqdm import tqdm

        iterator = tqdm(pages, desc="Downloading pages")
    else:
        iterator = pages

    documents = []
    for page in iterator:
        page_id = page.get("id")
        content = self.client.get_page_markdown(space_id, page_id)
        if not content:
            print(f"Warning: No content found for page ID {page_id}. Skipping...")
            continue

        if metadata_names is None:
            metadata = {"path": page.get("path")}
        else:
            # .get() yields None for absent keys instead of raising, so no
            # KeyError handling is needed; names were validated above.
            metadata = {name: page.get(name) for name in metadata_names}

        documents.append(Document(text=content, id_=page_id, metadata=metadata))

    return documents
|