跳至内容

文档字符串遍历器

初始化文件。

DocstringWalker #

基类: BaseReader

一个用于提取文档字符串并从中构建结构化文档的加载器。 递归遍历目录并从每个Python模块中提取文档字符串 - 从模块本身开始,然后是类,接着是函数。 构建提取的文档字符串之间的依赖关系图。

Source code in llama-index-integrations/readers/llama-index-readers-docstring-walker/llama_index/readers/docstring_walker/base.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
class DocstringWalker(BaseReader):
    """
    Reader that turns the docstrings of Python source files into Documents.

    Walks a directory tree, parses every ``.py`` file with ``ast`` and emits
    one Document per module. The document text starts with the module
    docstring, followed by the docstrings of the module's classes and
    functions (including nested ones), each labelled with the name of the
    element that contains it.
    """

    def load_data(
        self,
        code_dir: str,
        skip_initpy: bool = True,
        fail_on_malformed_files: bool = False,
    ) -> List[Document]:
        """
        Load one Document per Python module found under ``code_dir``.

        Thin entry point that forwards its arguments unchanged to
        ``process_directory``.

        Parameters
        ----------
        code_dir : str
            The directory path to the code files.
        skip_initpy : bool
            Whether to skip the __init__.py files. Defaults to True.
        fail_on_malformed_files : bool
            Whether to fail on malformed files. Defaults to False - in this case,
            the malformed files are skipped and a warning is logged.

        Returns
        -------
        List[Document]
            A list of loaded documents.

        """
        return self.process_directory(code_dir, skip_initpy, fail_on_malformed_files)

    def process_directory(
        self,
        code_dir: str,
        skip_initpy: bool = True,
        fail_on_malformed_files: bool = False,
    ) -> List[Document]:
        """
        Process a directory and extract information from Python files.

        Parameters
        ----------
        code_dir : str
            The directory path to the code files.
        skip_initpy : bool
            Whether to skip the __init__.py files. Defaults to True.
        fail_on_malformed_files : bool
            Whether to fail on malformed files. Defaults to False - in this case,
            the malformed files are skipped and a warning is logged.

        Returns
        -------
        List[Document]
            A list of Document objects, one per successfully parsed module.

        Raises
        ------
        Exception
            Whatever ``parse_module`` raised for a file, but only when
            ``fail_on_malformed_files`` is True.

        """
        extension = ".py"
        llama_docs = []
        for root, _, files in os.walk(code_dir):
            for file in files:
                if not file.endswith(extension):
                    continue
                if skip_initpy and file == "__init__.py":
                    continue
                # Drop only the trailing extension. ``str.replace`` would also
                # strip ".py" from the middle of a name such as "a.pyb.py".
                module_name = file[: -len(extension)]
                module_path = os.path.join(root, file)
                try:
                    doc = self.parse_module(module_name, module_path)
                except Exception as e:
                    if fail_on_malformed_files:
                        raise
                    log.warning(
                        "Failed to parse file %s. Skipping. Error: %s",
                        module_path,
                        e,
                    )
                    continue
                llama_docs.append(doc)
        return llama_docs

    def read_module_text(self, path: str) -> str:
        """
        Read the text of a Python module. For tests this function can be mocked.

        Parameters
        ----------
        path : str
            Path to the module.

        Returns
        -------
        str
            The text of the module, decoded as UTF-8.

        """
        with open(path, encoding="utf-8") as f:
            return f.read()

    def parse_module(self, module_name: str, path: str) -> Document:
        """
        Parse a single Python module into a Document.

        Only top-level elements whose exact type is listed in
        ``TYPES_TO_PROCESS`` are descended into.

        Parameters
        ----------
        module_name : str
            A module name, used to label the module and its direct children.
        path : str
            Path to the module.

        Returns
        -------
        Document
            A LlamaIndex Document whose text holds the module docstring
            followed by the text produced for each processed element.

        """
        source = self.read_module_text(path)
        module = ast.parse(source)
        module_docstring = ast.get_docstring(module)
        module_text = f"Module name: {module_name} \n Docstring: {module_docstring} \n"
        sub_texts = [
            self.process_elem(elem, module_name)
            for elem in module.body
            if type(elem) in TYPES_TO_PROCESS
        ]
        module_text += "\n".join(sub_texts)
        return Document(text=module_text)

    def process_class(self, class_node: ast.ClassDef, parent_node: str) -> str:
        """
        Build the text for a class node and everything defined in its body.

        Parameters
        ----------
        class_node : ast.ClassDef
            The class node to process. It represents a class definition
            in the abstract syntax tree (AST).
        parent_node : str
            The name of the element that contains the class.

        Returns
        -------
        str
            The class header (name, parent, docstring) followed by the text
            of each body element, joined with newlines.

        """
        cls_name = class_node.name
        cls_docstring = ast.get_docstring(class_node)

        text = f"\n Class name: {cls_name}, In: {parent_node} \n Docstring: {cls_docstring}"
        # Every body statement is passed on; anything that is not a class or
        # function contributes an empty string to the join.
        sub_texts = [self.process_elem(elem, cls_name) for elem in class_node.body]
        return text + "\n".join(sub_texts)

    def process_function(self, func_node: ast.FunctionDef, parent_node: str) -> str:
        """
        Build the text for a function node and everything defined in its body.

        Parameters
        ----------
        func_node : ast.FunctionDef
            The function node to process.
        parent_node : str
            The name of the element that contains the function.

        Returns
        -------
        str
            The function header (name, parent, docstring) followed by the
            text of each body element, joined with newlines.

        """
        func_name = func_node.name
        func_docstring = ast.get_docstring(func_node)

        text = f"\n Function name: {func_name}, In: {parent_node} \n Docstring: {func_docstring}"
        # Same convention as process_class: non-definition statements yield "".
        sub_texts = [self.process_elem(elem, func_name) for elem in func_node.body]
        return text + "\n".join(sub_texts)

    def process_elem(self, elem, parent_node: str) -> str:
        """
        Process an element in the abstract syntax tree (AST).

        This is a generic function that delegates the execution to more specific
        functions based on the type of the element.

        Parameters
        ----------
        elem : ast.AST
            The element to process.
        parent_node : str
            The name of the element that contains ``elem``.

        Returns
        -------
        str
            The text built for a function or class node, or an empty string
            for any other kind of node.

        """
        # NOTE(review): ``async def`` nodes are ast.AsyncFunctionDef rather
        # than ast.FunctionDef, so they end up in the empty-string branch -
        # confirm that this is intended.
        if isinstance(elem, ast.FunctionDef):
            return self.process_function(elem, parent_node)
        if isinstance(elem, ast.ClassDef):
            return self.process_class(elem, parent_node)
        return ""

加载数据 #

load_data(code_dir: str, skip_initpy: bool = True, fail_on_malformed_files: bool = False) -> List[Document]

从指定的代码目录加载数据。 此外,在加载数据后,构建已加载文档之间的依赖关系图。 该图将作为类的属性存储。

参数#

code_dir : str 代码文件的目录路径。 skip_initpy : bool 是否跳过 __init__.py 文件。默认为True。 fail_on_malformed_files : bool 是否在遇到格式错误的文件时报错。默认为False - 这种情况下, 格式错误的文件会被跳过并记录警告。

返回#

List[Document] 已加载文档的列表。

Source code in llama-index-integrations/readers/llama-index-readers-docstring-walker/llama_index/readers/docstring_walker/base.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def load_data(
    self,
    code_dir: str,
    skip_initpy: bool = True,
    fail_on_malformed_files: bool = False,
) -> List[Document]:
    """
    Load one Document per Python module found under ``code_dir``.

    Thin entry point: the three arguments are handed, in order and
    unchanged, to ``process_directory`` and its result is returned as is.

    Parameters
    ----------
    code_dir : str
        The directory path to the code files.
    skip_initpy : bool
        Whether to skip the __init__.py files. Defaults to True.
    fail_on_malformed_files : bool
        Whether to fail on malformed files. Defaults to False - in this case,
        the malformed files are skipped and a warning is logged.

    Returns
    -------
    List[Document]
        A list of loaded documents.

    """
    documents = self.process_directory(
        code_dir,
        skip_initpy,
        fail_on_malformed_files,
    )
    return documents

process_directory #

process_directory(code_dir: str, skip_initpy: bool = True, fail_on_malformed_files: bool = False) -> List[Document]

处理一个目录并从Python文件中提取信息。

参数#

code_dir : str 代码文件的目录路径。 skip_initpy : bool 是否跳过 __init__.py 文件。默认为True。 fail_on_malformed_files : bool 是否在遇到格式错误的文件时报错。默认为False - 这种情况下, 格式错误的文件会被跳过并记录警告。

返回#

List[Document] 一个Document对象的列表。

Source code in llama-index-integrations/readers/llama-index-readers-docstring-walker/llama_index/readers/docstring_walker/base.py
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def process_directory(
    self,
    code_dir: str,
    skip_initpy: bool = True,
    fail_on_malformed_files: bool = False,
) -> List[Document]:
    """
    Process a directory and extract information from Python files.

    Parameters
    ----------
    code_dir : str
        The directory path to the code files.
    skip_initpy : bool
        Whether to skip the __init__.py files. Defaults to True.
    fail_on_malformed_files : bool
        Whether to fail on malformed files. Defaults to False - in this case,
        the malformed files are skipped and a warning is logged.

    Returns
    -------
    List[Document]
        A list of Document objects, one per successfully parsed module.

    Raises
    ------
    Exception
        Whatever ``parse_module`` raised for a file, but only when
        ``fail_on_malformed_files`` is True.

    """
    extension = ".py"
    llama_docs = []
    for root, _, files in os.walk(code_dir):
        for file in files:
            if not file.endswith(extension):
                continue
            if skip_initpy and file == "__init__.py":
                continue
            # Drop only the trailing extension. ``str.replace`` would also
            # strip ".py" from the middle of a name such as "a.pyb.py".
            module_name = file[: -len(extension)]
            module_path = os.path.join(root, file)
            try:
                doc = self.parse_module(module_name, module_path)
            except Exception as e:
                if fail_on_malformed_files:
                    raise
                log.warning(
                    "Failed to parse file %s. Skipping. Error: %s",
                    module_path,
                    e,
                )
                continue
            llama_docs.append(doc)
    return llama_docs

read_module_text #

read_module_text(path: str) -> str

读取Python模块的文本内容。在测试时,可以对该函数进行模拟。

参数#

path : str 模块的路径。

返回#

str 模块的文本内容。

Source code in llama-index-integrations/readers/llama-index-readers-docstring-walker/llama_index/readers/docstring_walker/base.py
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
def read_module_text(self, path: str) -> str:
    """
    Return the full source text of the module stored at ``path``.

    Kept as a separate method so that tests can replace it and avoid
    touching the file system.

    Parameters
    ----------
    path : str
        Path to the module.

    Returns
    -------
    str
        The text of the module, decoded as UTF-8.

    """
    with open(path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

parse_module #

parse_module(module_name: str, path: str) -> Document

用于解析单个Python模块的函数。

参数#

module_name : str 模块名称。 path : str 模块路径。

返回#

Document 一个 LlamaIndex Document 对象,包含从模块中提取的信息。

Source code in llama-index-integrations/readers/llama-index-readers-docstring-walker/llama_index/readers/docstring_walker/base.py
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
def parse_module(self, module_name: str, path: str) -> Document:
    """
    Parse a single Python module into a Document.

    Only top-level elements whose exact type is listed in
    ``TYPES_TO_PROCESS`` are descended into.

    Parameters
    ----------
    module_name : str
        A module name, used to label the module and its direct children.
    path : str
        Path to the module.

    Returns
    -------
    Document
        A LlamaIndex Document whose text holds the module docstring
        followed by the text produced for each processed element.

    """
    tree = ast.parse(self.read_module_text(path))
    docstring = ast.get_docstring(tree)
    header = f"Module name: {module_name} \n Docstring: {docstring} \n"
    pieces = [
        self.process_elem(node, module_name)
        for node in tree.body
        if type(node) in TYPES_TO_PROCESS
    ]
    return Document(text=header + "\n".join(pieces))

process_class #

process_class(class_node: ClassDef, parent_node: str)

处理AST中的类节点,并将相关信息添加到图中。

参数#

class_node : ast.ClassDef 要处理的类节点。它表示抽象语法树(AST)中的一个类定义。 parent_node : str 父节点名称。它指定了图中父节点的名称。

返回#

str 处理后的类节点及其子元素的字符串表示形式。 它提供了处理后的类节点及其子元素的文本表示形式。

Source code in llama-index-integrations/readers/llama-index-readers-docstring-walker/llama_index/readers/docstring_walker/base.py
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
def process_class(self, class_node: ast.ClassDef, parent_node: str):
    """
    Build the text for a class node and everything defined in its body.

    Parameters
    ----------
    class_node : ast.ClassDef
        The class definition node taken from the abstract syntax tree (AST).
    parent_node : str
        The name of the element that contains the class.

    Returns
    -------
    str
        The class header (name, parent, docstring) followed by the text
        of each body element, joined with newlines.

    """
    name = class_node.name
    docstring = ast.get_docstring(class_node)
    header = f"\n Class name: {name}, In: {parent_node} \n Docstring: {docstring}"
    # Every body statement is passed on; whatever process_elem returns for
    # it (possibly an empty string) takes part in the join.
    children = [self.process_elem(child, name) for child in class_node.body]
    return header + "\n".join(children)

process_function #

process_function(func_node: FunctionDef, parent_node: str) -> str

处理AST中的函数节点并将其添加到图中。构建节点文本。

参数#

func_node : ast.FunctionDef 要处理的函数节点。 parent_node : str 父节点的名称。

返回#

str 一个表示处理后的函数节点及其子元素的字符串。

Source code in llama-index-integrations/readers/llama-index-readers-docstring-walker/llama_index/readers/docstring_walker/base.py
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
def process_function(self, func_node: ast.FunctionDef, parent_node: str) -> str:
    """
    Build the text for a function node and everything defined in its body.

    Parameters
    ----------
    func_node : ast.FunctionDef
        The function node to process.
    parent_node : str
        The name of the element that contains the function.

    Returns
    -------
    str
        The function header (name, parent, docstring) followed by the
        text of each body element, joined with newlines.

    """
    name = func_node.name
    docstring = ast.get_docstring(func_node)
    header = f"\n Function name: {name}, In: {parent_node} \n Docstring: {docstring}"
    # Every body statement is passed on; whatever process_elem returns for
    # it (possibly an empty string) takes part in the join.
    children = [self.process_elem(child, name) for child in func_node.body]
    return header + "\n".join(children)

process_elem #

process_elem(elem, parent_node: str) -> str

处理抽象语法树(AST)中的一个元素。

这是一个通用函数,根据元素的类型将执行委托给更具体的函数。

参数:

名称 类型 描述 默认值
elem AST

要处理的元素。

required
parent_node str

图中的父节点。

required
graph Graph

要更新的图表。

required

返回:

名称 类型 描述
str str

元素处理的结果。

Source code in llama-index-integrations/readers/llama-index-readers-docstring-walker/llama_index/readers/docstring_walker/base.py
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
def process_elem(self, elem, parent_node: str) -> str:
    """
    Dispatch an AST element to the handler that matches its type.

    Function definitions go to ``process_function`` and class definitions
    go to ``process_class``; every other node is ignored.

    Args:
        elem (ast.AST): The element to process.
        parent_node (str): The name of the element that contains ``elem``.

    Returns:
        str: The handler's result, or an empty string when ``elem`` is
            neither a function nor a class definition.

    """
    # The two node types are mutually exclusive, so the order of the
    # checks does not affect the outcome.
    if isinstance(elem, ast.ClassDef):
        return self.process_class(elem, parent_node)
    if isinstance(elem, ast.FunctionDef):
        return self.process_function(elem, parent_node)
    return ""
优云智算