文档字符串遍历器

初始化文件。

DocstringWalker #

基类: BaseReader

一个用于提取文档字符串并从中构建结构化文档的加载器。递归遍历目录并从每个Python模块中提取文档字符串 - 从模块本身开始，然后是类，接着是函数。构建提取的文档字符串之间的依赖关系图。

Source code in llama-index-integrations/readers/llama-index-readers-docstring-walker/llama_index/readers/docstring_walker/base.py

class DocstringWalker(BaseReader):
    """
    Reader that turns the docstrings of Python source files into Documents.

    Recursively walks a directory, parses each ``.py`` file with ``ast`` and
    produces one ``Document`` per module. The document text starts with the
    module name and module docstring, followed by the names and docstrings of
    the module's classes and functions, including definitions nested inside
    them.
    """

    def load_data(
        self,
        code_dir: str,
        skip_initpy: bool = True,
        fail_on_malformed_files: bool = False,
    ) -> List[Document]:
        """
        Load documents from the specified code directory.

        This is the reader entry point; it forwards its arguments unchanged
        to :meth:`process_directory`.

        Parameters
        ----------
        code_dir : str
            The directory path to the code files.
        skip_initpy : bool
            Whether to skip the __init__.py files. Defaults to True.
        fail_on_malformed_files : bool
            Whether to fail on malformed files. Defaults to False - in this case,
            the malformed files are skipped and a warning is logged.

        Returns
        -------
        List[Document]
            A list of loaded documents, one per successfully parsed module.

        """
        return self.process_directory(code_dir, skip_initpy, fail_on_malformed_files)

    def process_directory(
        self,
        code_dir: str,
        skip_initpy: bool = True,
        fail_on_malformed_files: bool = False,
    ) -> List[Document]:
        """
        Walk a directory tree and build one Document per Python file.

        Parameters
        ----------
        code_dir : str
            The directory path to the code files.
        skip_initpy : bool
            Whether to skip the __init__.py files. Defaults to True.
        fail_on_malformed_files : bool
            Whether to fail on malformed files. Defaults to False - in this case,
            the malformed files are skipped and a warning is logged.

        Returns
        -------
        List[Document]
            A list of Document objects, in the order the files were visited.

        Raises
        ------
        Exception
            Whatever ``parse_module`` raised, re-raised unchanged, when
            ``fail_on_malformed_files`` is True.

        """
        llama_docs = []
        for root, _, files in os.walk(code_dir):
            for file in files:
                if not file.endswith(".py"):
                    continue
                if skip_initpy and file == "__init__.py":
                    continue
                # Strip only the trailing extension. str.replace would also
                # remove a ".py" that appears elsewhere in the file name.
                module_name = file[: -len(".py")]
                module_path = os.path.join(root, file)
                try:
                    doc = self.parse_module(module_name, module_path)
                except Exception as e:
                    if fail_on_malformed_files:
                        raise
                    log.warning(
                        "Failed to parse file %s. Skipping. Error: %s",
                        module_path,
                        e,
                    )
                    continue
                llama_docs.append(doc)
        return llama_docs

    def read_module_text(self, path: str) -> str:
        """
        Read the text of a Python module. For tests this function can be mocked.

        Parameters
        ----------
        path : str
            Path to the module.

        Returns
        -------
        str
            The text of the module, decoded as UTF-8.

        """
        with open(path, encoding="utf-8") as f:
            return f.read()

    def parse_module(self, module_name: str, path: str) -> Document:
        """
        Parse a single Python module into a Document.

        Parameters
        ----------
        module_name : str
            A module name, used as a label in the generated text.
        path : str
            Path to the module.

        Returns
        -------
        Document
            A Document whose text holds the module name and docstring,
            followed by the text produced for each top-level element whose
            type is listed in ``TYPES_TO_PROCESS``.

        """
        source = self.read_module_text(path)
        module = ast.parse(source)
        module_docstring = ast.get_docstring(module)
        module_text = f"Module name: {module_name} \n Docstring: {module_docstring} \n"
        sub_texts = [
            self.process_elem(elem, module_name)
            for elem in module.body
            if type(elem) in TYPES_TO_PROCESS
        ]
        module_text += "\n".join(sub_texts)
        return Document(text=module_text)

    def process_class(self, class_node: ast.ClassDef, parent_node: str) -> str:
        """
        Build the text for a class node and everything defined in its body.

        Parameters
        ----------
        class_node : ast.ClassDef
            The class node to process. It represents a class definition
            in the abstract syntax tree (AST).
        parent_node : str
            The name of the enclosing module, class or function; it is only
            used as a label in the generated text.

        Returns
        -------
        str
            The class name, parent name and docstring, followed by the text
            of each element of the class body joined with newlines.

        """
        cls_name = class_node.name
        cls_docstring = ast.get_docstring(class_node)

        text = f"\n Class name: {cls_name}, In: {parent_node} \n Docstring: {cls_docstring}"
        # Every body statement is visited; process_elem returns "" for
        # anything that is not a function or class definition.
        sub_texts = [self.process_elem(elem, cls_name) for elem in class_node.body]
        return text + "\n".join(sub_texts)

    def process_function(self, func_node: ast.FunctionDef, parent_node: str) -> str:
        """
        Build the text for a function node and everything defined in its body.

        Parameters
        ----------
        func_node : ast.FunctionDef
            The function node to process.
        parent_node : str
            The name of the enclosing module, class or function; it is only
            used as a label in the generated text.

        Returns
        -------
        str
            The function name, parent name and docstring, followed by the
            text of each element of the function body joined with newlines.

        """
        func_name = func_node.name
        func_docstring = ast.get_docstring(func_node)

        text = f"\n Function name: {func_name}, In: {parent_node} \n Docstring: {func_docstring}"
        # Every body statement is visited; process_elem returns "" for
        # anything that is not a function or class definition.
        sub_texts = [self.process_elem(elem, func_name) for elem in func_node.body]
        return text + "\n".join(sub_texts)

    def process_elem(self, elem, parent_node: str) -> str:
        """
        Process an element in the abstract syntax tree (AST).

        This is a generic function that delegates the execution to more specific
        functions based on the type of the element.

        Parameters
        ----------
        elem : ast.AST
            The element to process.
        parent_node : str
            The name of the element's parent, forwarded to the specific
            handler.

        Returns
        -------
        str
            The text built by ``process_function`` or ``process_class``, or
            an empty string for any other node type.

        """
        if isinstance(elem, ast.FunctionDef):
            return self.process_function(elem, parent_node)
        if isinstance(elem, ast.ClassDef):
            return self.process_class(elem, parent_node)
        return ""

load_data #

load_data(code_dir: str, skip_initpy: bool = True, fail_on_malformed_files: bool = False) -> List[Document]

从指定的代码目录加载数据。此外，在加载数据后，构建已加载文档之间的依赖关系图。该图将作为类的属性存储。

参数#

code_dir : str 代码文件的目录路径。 skip_initpy : bool 是否跳过 `__init__.py` 文件。默认为True。 fail_on_malformed_files : bool 是否在遇到格式错误的文件时报错。默认为False - 这种情况下，格式错误的文件会被跳过并记录警告。

返回#

List[Document] 已加载文档的列表。

Source code in llama-index-integrations/readers/llama-index-readers-docstring-walker/llama_index/readers/docstring_walker/base.py

def load_data(
    self,
    code_dir: str,
    skip_initpy: bool = True,
    fail_on_malformed_files: bool = False,
) -> List[Document]:
    """
    Load documents from the given code directory.

    Reader entry point: the arguments are forwarded positionally, unchanged,
    to ``self.process_directory`` and its result is returned as-is.

    Parameters
    ----------
    code_dir : str
        The directory path to the code files.
    skip_initpy : bool
        Whether to skip the __init__.py files. Defaults to True.
    fail_on_malformed_files : bool
        Whether to fail on malformed files. Defaults to False - in this case,
        the malformed files are skipped and a warning is logged.

    Returns
    -------
    List[Document]
        A list of loaded documents.

    """
    documents = self.process_directory(
        code_dir,
        skip_initpy,
        fail_on_malformed_files,
    )
    return documents

process_directory #

process_directory(code_dir: str, skip_initpy: bool = True, fail_on_malformed_files: bool = False) -> List[Document]

处理一个目录并从Python文件中提取信息。

参数#

code_dir : str 代码文件的目录路径。 skip_initpy : bool 是否跳过 `__init__.py` 文件。默认为True。 fail_on_malformed_files : bool 是否在遇到格式错误的文件时报错。默认为False - 这种情况下，格式错误的文件会被跳过并记录警告。

返回#

List[Document] 一个Document对象的列表。

Source code in llama-index-integrations/readers/llama-index-readers-docstring-walker/llama_index/readers/docstring_walker/base.py

def process_directory(
    self,
    code_dir: str,
    skip_initpy: bool = True,
    fail_on_malformed_files: bool = False,
) -> List[Document]:
    """
    Walk a directory tree and build one Document per Python file.

    Parameters
    ----------
    code_dir : str
        The directory path to the code files.
    skip_initpy : bool
        Whether to skip the __init__.py files. Defaults to True.
    fail_on_malformed_files : bool
        Whether to fail on malformed files. Defaults to False - in this case,
        the malformed files are skipped and a warning is logged.

    Returns
    -------
    List[Document]
        A list of Document objects, in the order the files were visited.

    Raises
    ------
    Exception
        Whatever ``parse_module`` raised, re-raised unchanged, when
        ``fail_on_malformed_files`` is True.

    """
    llama_docs = []
    for root, _, files in os.walk(code_dir):
        for file in files:
            if not file.endswith(".py"):
                continue
            if skip_initpy and file == "__init__.py":
                continue
            # Strip only the trailing extension. str.replace would also
            # remove a ".py" that appears elsewhere in the file name.
            module_name = file[: -len(".py")]
            module_path = os.path.join(root, file)
            try:
                doc = self.parse_module(module_name, module_path)
            except Exception as e:
                if fail_on_malformed_files:
                    raise
                log.warning(
                    "Failed to parse file %s. Skipping. Error: %s",
                    module_path,
                    e,
                )
                continue
            llama_docs.append(doc)
    return llama_docs

read_module_text #

read_module_text(path: str) -> str

读取Python模块的文本内容。在测试时，可以对该函数进行模拟。

参数#

path : str 模块的路径。

返回#

str 模块的文本内容。

Source code in llama-index-integrations/readers/llama-index-readers-docstring-walker/llama_index/readers/docstring_walker/base.py

def read_module_text(self, path: str) -> str:
    """
    Return the full source text of the Python module stored at ``path``.

    Kept as its own method so that tests can replace it.

    Parameters
    ----------
    path : str
        Path to the module.

    Returns
    -------
    str
        The contents of the file, decoded as UTF-8.

    """
    with open(path, encoding="utf-8") as module_file:
        source = module_file.read()
    return source

parse_module #

parse_module(module_name: str, path: str) -> Document

用于解析单个Python模块的函数。

参数#

module_name : str 模块名称。 path : str 模块路径。

返回#

Document 一个 LlamaIndex Document 对象，包含从模块中提取的信息。

Source code in llama-index-integrations/readers/llama-index-readers-docstring-walker/llama_index/readers/docstring_walker/base.py

def parse_module(self, module_name: str, path: str) -> Document:
    """
    Parse one Python module and wrap its extracted text in a Document.

    Parameters
    ----------
    module_name : str
        A module name, used as a label in the generated text.
    path : str
        Path to the module.

    Returns
    -------
    Document
        A Document whose text holds the module name and docstring, followed
        by the text produced for each top-level element whose type is listed
        in ``TYPES_TO_PROCESS``.

    """
    tree = ast.parse(self.read_module_text(path))
    header = f"Module name: {module_name} \n Docstring: {ast.get_docstring(tree)} \n"
    # Only top-level nodes of the configured types contribute a section.
    sections = [
        self.process_elem(node, module_name)
        for node in tree.body
        if type(node) in TYPES_TO_PROCESS
    ]
    return Document(text=header + "\n".join(sections))

process_class #

process_class(class_node: ClassDef, parent_node: str)

处理AST中的类节点，并将相关信息添加到图中。

参数#

class_node : ast.ClassDef 要处理的类节点。它表示抽象语法树(AST)中的一个类定义。 parent_node : str 父节点名称。它指定了图中父节点的名称。

返回#

str 处理后的类节点及其子元素的字符串表示形式。它提供了处理后的类节点及其子元素的文本表示形式。

Source code in llama-index-integrations/readers/llama-index-readers-docstring-walker/llama_index/readers/docstring_walker/base.py

def process_class(self, class_node: ast.ClassDef, parent_node: str) -> str:
    """
    Build the text for a class node and everything defined in its body.

    Parameters
    ----------
    class_node : ast.ClassDef
        The class node to process. It represents a class definition
        in the abstract syntax tree (AST).
    parent_node : str
        The name of the enclosing module, class or function; it is only
        used as a label in the generated text.

    Returns
    -------
    str
        The class name, parent name and docstring, followed by the text
        that ``self.process_elem`` returns for each statement of the class
        body, joined with newlines.

    """
    cls_name = class_node.name
    cls_docstring = ast.get_docstring(class_node)

    text = f"\n Class name: {cls_name}, In: {parent_node} \n Docstring: {cls_docstring}"
    # The class itself becomes the parent label of its body elements.
    sub_texts = [self.process_elem(elem, cls_name) for elem in class_node.body]
    return text + "\n".join(sub_texts)

process_function #

process_function(func_node: FunctionDef, parent_node: str) -> str

处理AST中的函数节点并将其添加到图中。构建节点文本。

参数#

func_node : ast.FunctionDef 要处理的函数节点。 parent_node : str 父节点的名称。

返回#

str 一个表示处理后的函数节点及其子元素的字符串。

Source code in llama-index-integrations/readers/llama-index-readers-docstring-walker/llama_index/readers/docstring_walker/base.py

def process_function(self, func_node: ast.FunctionDef, parent_node: str) -> str:
    """
    Build the text for a function node and everything defined in its body.

    Parameters
    ----------
    func_node : ast.FunctionDef
        The function node to process.
    parent_node : str
        The name of the parent node; it is only used as a label in the
        generated text.

    Returns
    -------
    str
        The function name, parent name and docstring, followed by the text
        that ``self.process_elem`` returns for each statement of the function
        body, joined with newlines.

    """
    name = func_node.name
    header = f"\n Function name: {name}, In: {parent_node} \n Docstring: {ast.get_docstring(func_node)}"
    # The function itself becomes the parent label of its body elements.
    nested = [self.process_elem(stmt, name) for stmt in func_node.body]
    return header + "\n".join(nested)

process_elem #

process_elem(elem, parent_node: str) -> str

处理抽象语法树(AST)中的一个元素。

这是一个通用函数，根据元素的类型将执行委托给更具体的函数。

参数:

名称	类型	描述	默认值
`elem`	`AST`	要处理的元素。	required
`parent_node`	`str`	图中的父节点。	required
`graph`	`Graph`	要更新的图表。	required

返回：

名称	类型	描述
`str`	`str`	元素处理的结果。

Source code in llama-index-integrations/readers/llama-index-readers-docstring-walker/llama_index/readers/docstring_walker/base.py

def process_elem(self, elem, parent_node: str) -> str:
    """
    Dispatch an AST element to the handler that matches its type.

    Class definitions go to ``self.process_class`` and function definitions
    to ``self.process_function``; every other node type is ignored.

    Args:
        elem (ast.AST): The element to process.
        parent_node (str): The name of the element's parent, forwarded to
            the handler.

    Returns:
        str: The handler's result, or an empty string when the element is
        neither an ``ast.FunctionDef`` nor an ``ast.ClassDef``.

    """
    if isinstance(elem, ast.ClassDef):
        return self.process_class(elem, parent_node)
    if isinstance(elem, ast.FunctionDef):
        return self.process_function(elem, parent_node)
    return ""