Html

节点解析器。

HTML节点解析器 #

基类：EventNodeParser

HTML节点解析器。

使用自定义HTML分割逻辑将文档分割为节点。

参数：

名称	类型	描述	默认
`include_metadata`	`bool`	是否在节点中包含元数据	required
`include_prev_next_rel`	`bool`	是否包含上一页/下一页关系	required
`tags`	`List[str]`	用于提取文本的HTML标签。	`['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'b', 'i', 'u', 'section']`

workflows/handler.py 中的源代码llama_index/core/node_parser/file/html.py

class HTMLNodeParser(NodeParser):
    """
    HTML node parser.

    Splits a document into Nodes using custom HTML splitting logic.

    Args:
        include_metadata (bool): whether to include metadata in nodes
        include_prev_next_rel (bool): whether to include prev/next relationships

    """

    tags: List[str] = Field(
        default=DEFAULT_TAGS, description="HTML tags to extract text from."
    )

    @classmethod
    def from_defaults(
        cls,
        include_metadata: bool = True,
        include_prev_next_rel: bool = True,
        callback_manager: Optional[CallbackManager] = None,
        tags: Optional[List[str]] = DEFAULT_TAGS,
    ) -> "HTMLNodeParser":
        callback_manager = callback_manager or CallbackManager([])

        return cls(
            include_metadata=include_metadata,
            include_prev_next_rel=include_prev_next_rel,
            callback_manager=callback_manager,
            tags=tags,
        )

    @classmethod
    def class_name(cls) -> str:
        """Get class name."""
        return "HTMLNodeParser"

    def _parse_nodes(
        self,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> List[BaseNode]:
        all_nodes: List[BaseNode] = []
        nodes_with_progress = get_tqdm_iterable(nodes, show_progress, "Parsing nodes")

        for node in nodes_with_progress:
            nodes = self.get_nodes_from_node(node)
            all_nodes.extend(nodes)

        return all_nodes

    def get_nodes_from_node(self, node: BaseNode) -> List[TextNode]:
        """Get nodes from document."""
        try:
            from bs4 import BeautifulSoup, Tag
        except ImportError:
            raise ImportError("bs4 is required to read HTML files.")

        text = node.get_content(metadata_mode=MetadataMode.NONE)
        soup = BeautifulSoup(text, "html.parser")
        html_nodes = []
        last_tag = None
        current_section = ""

        tags = soup.find_all(self.tags)
        for tag in tags:
            tag_text = self._extract_text_from_tag(tag)
            if isinstance(tag, Tag) and (tag.name == last_tag or last_tag is None):
                last_tag = tag.name
                current_section += f"{tag_text.strip()}\n"
            else:
                html_nodes.append(
                    self._build_node_from_split(
                        current_section.strip(), node, {"tag": last_tag}
                    )
                )
                if isinstance(tag, Tag):
                    last_tag = tag.name
                current_section = f"{tag_text}\n"

        if current_section:
            html_nodes.append(
                self._build_node_from_split(
                    current_section.strip(), node, {"tag": last_tag}
                )
            )

        return html_nodes

    def _extract_text_from_tag(
        self, tag: Union["Tag", "NavigableString", "PageElement"]
    ) -> str:
        from bs4 import NavigableString, Tag, PageElement

        texts = []
        if isinstance(tag, Tag):
            for elem in tag.children:
                if isinstance(elem, NavigableString):
                    if elem.strip():
                        texts.append(elem.strip())
                elif isinstance(elem, Tag):
                    if elem.name in self.tags:
                        continue
                    else:
                        texts.append(elem.get_text().strip())
                elif isinstance(elem, PageElement):
                    texts.append(elem.get_text().strip())
        else:
            texts.append(tag.get_text().strip())
        return "\n".join(texts)

    def _build_node_from_split(
        self,
        text_split: str,
        node: BaseNode,
        metadata: dict,
    ) -> TextNode:
        """Build node from single text split."""
        node = build_nodes_from_splits([text_split], node, id_func=self.id_func)[0]

        if self.include_metadata:
            node.metadata = {**node.metadata, **metadata}

        return node

class_name `classmethod` #

class_name() -> str

获取类名。

workflows/handler.py 中的源代码llama_index/core/node_parser/file/html.py

@classmethod
def class_name(cls) -> str:
    """Get class name."""
    return "HTMLNodeParser"

get_nodes_from_node #

get_nodes_from_node(node: BaseNode) -> List[TextNode]

从文档中获取节点。

workflows/handler.py 中的源代码llama_index/core/node_parser/file/html.py

def get_nodes_from_node(self, node: BaseNode) -> List[TextNode]:
    """Get nodes from document."""
    try:
        from bs4 import BeautifulSoup, Tag
    except ImportError:
        raise ImportError("bs4 is required to read HTML files.")

    text = node.get_content(metadata_mode=MetadataMode.NONE)
    soup = BeautifulSoup(text, "html.parser")
    html_nodes = []
    last_tag = None
    current_section = ""

    tags = soup.find_all(self.tags)
    for tag in tags:
        tag_text = self._extract_text_from_tag(tag)
        if isinstance(tag, Tag) and (tag.name == last_tag or last_tag is None):
            last_tag = tag.name
            current_section += f"{tag_text.strip()}\n"
        else:
            html_nodes.append(
                self._build_node_from_split(
                    current_section.strip(), node, {"tag": last_tag}
                )
            )
            if isinstance(tag, Tag):
                last_tag = tag.name
            current_section = f"{tag_text}\n"

    if current_section:
        html_nodes.append(
            self._build_node_from_split(
                current_section.strip(), node, {"tag": last_tag}
            )
        )

    return html_nodes

JSON节点解析器 #

基类：EventNodeParser

JSON节点解析器。

使用自定义 JSON 拆分逻辑将文档拆分为节点。

参数：

名称	类型	描述	默认
`include_metadata`	`bool`	是否在节点中包含元数据	required
`include_prev_next_rel`	`bool`	是否包含上一页/下一页关系	required

workflows/handler.py 中的源代码llama_index/core/node_parser/file/json.py

class JSONNodeParser(NodeParser):
    """
    JSON node parser.

    Splits a document into Nodes using custom JSON splitting logic.

    Args:
        include_metadata (bool): whether to include metadata in nodes
        include_prev_next_rel (bool): whether to include prev/next relationships

    """

    @classmethod
    def from_defaults(
        cls,
        include_metadata: bool = True,
        include_prev_next_rel: bool = True,
        callback_manager: Optional[CallbackManager] = None,
    ) -> "JSONNodeParser":
        callback_manager = callback_manager or CallbackManager([])

        return cls(
            include_metadata=include_metadata,
            include_prev_next_rel=include_prev_next_rel,
            callback_manager=callback_manager,
        )

    @classmethod
    def class_name(cls) -> str:
        """Get class name."""
        return "JSONNodeParser"

    def _parse_nodes(
        self, nodes: Sequence[BaseNode], show_progress: bool = False, **kwargs: Any
    ) -> List[BaseNode]:
        all_nodes: List[BaseNode] = []
        nodes_with_progress = get_tqdm_iterable(nodes, show_progress, "Parsing nodes")

        for node in nodes_with_progress:
            nodes = self.get_nodes_from_node(node)
            all_nodes.extend(nodes)

        return all_nodes

    def get_nodes_from_node(self, node: BaseNode) -> List[TextNode]:
        """Get nodes from document."""
        text = node.get_content(metadata_mode=MetadataMode.NONE)
        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            # Handle invalid JSON input here
            return []

        json_nodes = []
        if isinstance(data, dict):
            lines = [*self._depth_first_yield(data, 0, [])]
            json_nodes.extend(
                build_nodes_from_splits(["\n".join(lines)], node, id_func=self.id_func)
            )
        elif isinstance(data, list):
            for json_object in data:
                lines = [*self._depth_first_yield(json_object, 0, [])]
                json_nodes.extend(
                    build_nodes_from_splits(
                        ["\n".join(lines)], node, id_func=self.id_func
                    )
                )
        else:
            raise ValueError("JSON is invalid")

        return json_nodes

    def _depth_first_yield(
        self, json_data: Dict, levels_back: int, path: List[str]
    ) -> Generator[str, None, None]:
        """
        Do depth first yield of all of the leaf nodes of a JSON.

        Combines keys in the JSON tree using spaces.

        If levels_back is set to 0, prints all levels.

        """
        if isinstance(json_data, dict):
            for key, value in json_data.items():
                new_path = path[:]
                new_path.append(key)
                yield from self._depth_first_yield(value, levels_back, new_path)
        elif isinstance(json_data, list):
            for _, value in enumerate(json_data):
                yield from self._depth_first_yield(value, levels_back, path)
        else:
            new_path = path[-levels_back:]
            new_path.append(str(json_data))
            yield " ".join(new_path)

class_name `classmethod` #

class_name() -> str

获取类名。

workflows/handler.py 中的源代码llama_index/core/node_parser/file/json.py

@classmethod
def class_name(cls) -> str:
    """Get class name."""
    return "JSONNodeParser"

get_nodes_from_node #

get_nodes_from_node(node: BaseNode) -> List[TextNode]

从文档中获取节点。

workflows/handler.py 中的源代码llama_index/core/node_parser/file/json.py

def get_nodes_from_node(self, node: BaseNode) -> List[TextNode]:
    """Get nodes from document."""
    text = node.get_content(metadata_mode=MetadataMode.NONE)
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Handle invalid JSON input here
        return []

    json_nodes = []
    if isinstance(data, dict):
        lines = [*self._depth_first_yield(data, 0, [])]
        json_nodes.extend(
            build_nodes_from_splits(["\n".join(lines)], node, id_func=self.id_func)
        )
    elif isinstance(data, list):
        for json_object in data:
            lines = [*self._depth_first_yield(json_object, 0, [])]
            json_nodes.extend(
                build_nodes_from_splits(
                    ["\n".join(lines)], node, id_func=self.id_func
                )
            )
    else:
        raise ValueError("JSON is invalid")

    return json_nodes

Markdown节点解析器 #

基类：EventNodeParser

Markdown 节点解析器。

使用基于Markdown标题的分割逻辑将文档分割为节点。每个节点包含其文本内容以及指向它的标题路径。

参数：

名称	类型	描述	默认
`include_metadata`	`bool`	是否在节点中包含元数据	required
`include_prev_next_rel`	`bool`	是否包含上一页/下一页关系	required
`header_path_separator`	`str`	用于章节标题路径元数据的分隔符字符	`'/'`

workflows/handler.py 中的源代码llama_index/core/node_parser/file/markdown.py

class MarkdownNodeParser(NodeParser):
    """
    Markdown node parser.

    Splits a document into Nodes using Markdown header-based splitting logic.
    Each node contains its text content and the path of headers leading to it.

    Args:
        include_metadata (bool): whether to include metadata in nodes
        include_prev_next_rel (bool): whether to include prev/next relationships
        header_path_separator (str): separator char used for section header path metadata

    """

    header_path_separator: str = Field(
        default="/", description="Separator char used for section header path metadata."
    )

    @classmethod
    def from_defaults(
        cls,
        include_metadata: bool = True,
        include_prev_next_rel: bool = True,
        header_path_separator: str = "/",
        callback_manager: Optional[CallbackManager] = None,
    ) -> "MarkdownNodeParser":
        callback_manager = callback_manager or CallbackManager([])
        return cls(
            include_metadata=include_metadata,
            include_prev_next_rel=include_prev_next_rel,
            header_path_separator=header_path_separator,
            callback_manager=callback_manager,
        )

    def get_nodes_from_node(self, node: BaseNode) -> List[TextNode]:
        """Get nodes from document by splitting on headers."""
        text = node.get_content(metadata_mode=MetadataMode.NONE)
        markdown_nodes = []
        lines = text.split("\n")
        current_section = ""
        # Keep track of (markdown level, text) for headers
        header_stack: List[tuple[int, str]] = []
        code_block = False

        for line in lines:
            # Track if we're inside a code block to avoid parsing headers in code
            if line.lstrip().startswith("```"):
                code_block = not code_block
                current_section += line + "\n"
                continue

            # Only parse headers if we're not in a code block
            if not code_block:
                header_match = re.match(r"^(#+)\s(.*)", line)
                if header_match:
                    # Save the previous section before starting a new one
                    if current_section.strip():
                        markdown_nodes.append(
                            self._build_node_from_split(
                                current_section.strip(),
                                node,
                                self.header_path_separator.join(
                                    h[1] for h in header_stack[:-1]
                                ),
                            )
                        )

                    header_level = len(header_match.group(1))
                    header_text = header_match.group(2)

                    # Compare against top-of-stack item’s markdown level.
                    # Pop headers of equal or higher markdown level; not necessarily current stack size / depth.
                    # Hierarchy depth gets deeper one level at a time, but markdown headers can jump from H1 to H3, for example.
                    while header_stack and header_stack[-1][0] >= header_level:
                        header_stack.pop()

                    # Add the new header
                    header_stack.append((header_level, header_text))
                    current_section = "#" * header_level + f" {header_text}\n"
                    continue

            current_section += line + "\n"

        # Add the final section
        if current_section.strip():
            markdown_nodes.append(
                self._build_node_from_split(
                    current_section.strip(),
                    node,
                    self.header_path_separator.join(h[1] for h in header_stack[:-1]),
                )
            )

        return markdown_nodes

    def _build_node_from_split(
        self,
        text_split: str,
        node: BaseNode,
        header_path: str,
    ) -> TextNode:
        """Build node from single text split."""
        node = build_nodes_from_splits([text_split], node, id_func=self.id_func)[0]

        if self.include_metadata:
            separator = self.header_path_separator
            node.metadata["header_path"] = (
                # ex: "/header1/header2/" || "/"
                separator + header_path + separator if header_path else separator
            )

        return node

    def _parse_nodes(
        self,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> List[BaseNode]:
        """Parse nodes."""
        all_nodes: List[BaseNode] = []
        nodes_with_progress = get_tqdm_iterable(nodes, show_progress, "Parsing nodes")

        for node in nodes_with_progress:
            nodes = self.get_nodes_from_node(node)
            all_nodes.extend(nodes)

        return all_nodes

get_nodes_from_node #

get_nodes_from_node(node: BaseNode) -> List[TextNode]

通过按标题分割从文档中获取节点。

workflows/handler.py 中的源代码llama_index/core/node_parser/file/markdown.py

def get_nodes_from_node(self, node: BaseNode) -> List[TextNode]:
    """Get nodes from document by splitting on headers."""
    text = node.get_content(metadata_mode=MetadataMode.NONE)
    markdown_nodes = []
    lines = text.split("\n")
    current_section = ""
    # Keep track of (markdown level, text) for headers
    header_stack: List[tuple[int, str]] = []
    code_block = False

    for line in lines:
        # Track if we're inside a code block to avoid parsing headers in code
        if line.lstrip().startswith("```"):
            code_block = not code_block
            current_section += line + "\n"
            continue

        # Only parse headers if we're not in a code block
        if not code_block:
            header_match = re.match(r"^(#+)\s(.*)", line)
            if header_match:
                # Save the previous section before starting a new one
                if current_section.strip():
                    markdown_nodes.append(
                        self._build_node_from_split(
                            current_section.strip(),
                            node,
                            self.header_path_separator.join(
                                h[1] for h in header_stack[:-1]
                            ),
                        )
                    )

                header_level = len(header_match.group(1))
                header_text = header_match.group(2)

                # Compare against top-of-stack item’s markdown level.
                # Pop headers of equal or higher markdown level; not necessarily current stack size / depth.
                # Hierarchy depth gets deeper one level at a time, but markdown headers can jump from H1 to H3, for example.
                while header_stack and header_stack[-1][0] >= header_level:
                    header_stack.pop()

                # Add the new header
                header_stack.append((header_level, header_text))
                current_section = "#" * header_level + f" {header_text}\n"
                continue

        current_section += line + "\n"

    # Add the final section
    if current_section.strip():
        markdown_nodes.append(
            self._build_node_from_split(
                current_section.strip(),
                node,
                self.header_path_separator.join(h[1] for h in header_stack[:-1]),
            )
        )

    return markdown_nodes

简单文件节点解析器 #

基类：EventNodeParser

简单文件节点解析器。

根据文件类型自动将加载的文档拆分为节点基于文件类型自动检测要使用的节点解析器

参数：

名称	类型	描述	默认
`include_metadata`	`bool`	是否在节点中包含元数据	required
`include_prev_next_rel`	`bool`	是否包含上一页/下一页关系	required

workflows/handler.py 中的源代码llama_index/core/node_parser/file/simple_file.py

class SimpleFileNodeParser(NodeParser):
    """
    Simple file node parser.

    Splits a document loaded from a file into Nodes using logic based on the file type
    automatically detects the NodeParser to use based on file type

    Args:
        include_metadata (bool): whether to include metadata in nodes
        include_prev_next_rel (bool): whether to include prev/next relationships

    """

    @classmethod
    def from_defaults(
        cls,
        include_metadata: bool = True,
        include_prev_next_rel: bool = True,
        callback_manager: Optional[CallbackManager] = None,
    ) -> "SimpleFileNodeParser":
        callback_manager = callback_manager or CallbackManager([])

        return cls(
            include_metadata=include_metadata,
            include_prev_next_rel=include_prev_next_rel,
            callback_manager=callback_manager,
        )

    @classmethod
    def class_name(cls) -> str:
        """Get class name."""
        return "SimpleFileNodeParser"

    def _parse_nodes(
        self,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> List[BaseNode]:
        """
        Parse document into nodes.

        Args:
            nodes (Sequence[BaseNode]): nodes to parse

        """
        all_nodes: List[BaseNode] = []
        documents_with_progress = get_tqdm_iterable(
            nodes, show_progress, "Parsing documents into nodes"
        )

        for document in documents_with_progress:
            # Try to get extension from metadata, or extract from file_path
            ext = document.metadata.get("extension")
            if ext is None and "file_path" in document.metadata:
                # Extract extension from file_path
                _, ext = os.path.splitext(document.metadata["file_path"])
                ext = ext.lower()

            if ext and ext in FILE_NODE_PARSERS:
                parser = FILE_NODE_PARSERS[ext](
                    include_metadata=self.include_metadata,
                    include_prev_next_rel=self.include_prev_next_rel,
                    callback_manager=self.callback_manager,
                )

                nodes = parser.get_nodes_from_documents([document], show_progress)
                all_nodes.extend(nodes)
            else:
                # What to do when file type isn't supported yet?
                all_nodes.extend(
                    # build node from document
                    build_nodes_from_splits(
                        [document.get_content(metadata_mode=MetadataMode.NONE)],
                        document,
                        id_func=self.id_func,
                    )
                )

        return all_nodes

class_name `classmethod` #

class_name() -> str

获取类名。

workflows/handler.py 中的源代码llama_index/core/node_parser/file/simple_file.py

@classmethod
def class_name(cls) -> str:
    """Get class name."""
    return "SimpleFileNodeParser"

MetadataAwareTextSplitter #

基类：EventTextSplitter

workflows/handler.py 中的源代码llama_index/core/node_parser/interface.py

class MetadataAwareTextSplitter(TextSplitter):
    @abstractmethod
    def split_text_metadata_aware(self, text: str, metadata_str: str) -> List[str]: ...

    def split_texts_metadata_aware(
        self, texts: List[str], metadata_strs: List[str]
    ) -> List[str]:
        if len(texts) != len(metadata_strs):
            raise ValueError("Texts and metadata_strs must have the same length")
        nested_texts = [
            self.split_text_metadata_aware(text, metadata)
            for text, metadata in zip(texts, metadata_strs)
        ]
        return [item for sublist in nested_texts for item in sublist]

    def _get_metadata_str(self, node: BaseNode) -> str:
        """Helper function to get the proper metadata str for splitting."""
        embed_metadata_str = node.get_metadata_str(mode=MetadataMode.EMBED)
        llm_metadata_str = node.get_metadata_str(mode=MetadataMode.LLM)

        # use the longest metadata str for splitting
        if len(embed_metadata_str) > len(llm_metadata_str):
            metadata_str = embed_metadata_str
        else:
            metadata_str = llm_metadata_str

        return metadata_str

    def _parse_nodes(
        self, nodes: Sequence[BaseNode], show_progress: bool = False, **kwargs: Any
    ) -> List[BaseNode]:
        all_nodes: List[BaseNode] = []
        nodes_with_progress = get_tqdm_iterable(nodes, show_progress, "Parsing nodes")

        for node in nodes_with_progress:
            metadata_str = self._get_metadata_str(node)
            splits = self.split_text_metadata_aware(
                node.get_content(metadata_mode=MetadataMode.NONE),
                metadata_str=metadata_str,
            )
            all_nodes.extend(
                build_nodes_from_splits(splits, node, id_func=self.id_func)
            )

        return all_nodes

NodeParser #

Bases: TransformComponent, ABC

节点解析器的基础接口。

参数：

名称	类型	描述	默认
`include_metadata`	`bool`	分割时是否考虑元数据。	`True`
`include_prev_next_rel`	`bool`	包含上一个/下一个节点关系。	`True`
`callback_manager`	`CallbackManager`		`<llama_index.core.callbacks.base.CallbackManager object at 0x7f82e1235730>`
`id_func`	`Annotated[Callable, FieldInfo, BeforeValidator, WithJsonSchema, WithJsonSchema, PlainSerializer] \| None`	生成节点ID的函数。	`None`

workflows/handler.py 中的源代码llama_index/core/node_parser/interface.py

class NodeParser(TransformComponent, ABC):
    """Base interface for node parser."""

    model_config = ConfigDict(arbitrary_types_allowed=True)
    include_metadata: bool = Field(
        default=True, description="Whether or not to consider metadata when splitting."
    )
    include_prev_next_rel: bool = Field(
        default=True, description="Include prev/next node relationships."
    )
    callback_manager: CallbackManager = Field(
        default_factory=lambda: CallbackManager([]), exclude=True
    )
    id_func: IdFuncCallable = Field(
        default=default_id_func,
        description="Function to generate node IDs.",
    )

    @abstractmethod
    def _parse_nodes(
        self,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> List[BaseNode]: ...

    async def _aparse_nodes(
        self,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> List[BaseNode]:
        return self._parse_nodes(nodes, show_progress=show_progress, **kwargs)

    def _postprocess_parsed_nodes(
        self, nodes: List[BaseNode], parent_doc_map: Dict[str, Document]
    ) -> List[BaseNode]:
        # Track search position per document to handle duplicate text correctly
        # Nodes are assumed to be in document order from _parse_nodes
        # We track the START position (not end) to allow for overlapping chunks
        doc_search_positions: Dict[str, int] = {}

        for i, node in enumerate(nodes):
            parent_doc = parent_doc_map.get(node.ref_doc_id or "", None)
            parent_node = node.source_node

            if parent_doc is not None:
                if parent_doc.source_node is not None:
                    node.relationships.update(
                        {
                            NodeRelationship.SOURCE: parent_doc.source_node,
                        }
                    )

                # Get or initialize search position for this document
                doc_id = node.ref_doc_id or ""
                search_start = doc_search_positions.get(doc_id, 0)

                # Search for node content starting from the last found position
                node_content = node.get_content(metadata_mode=MetadataMode.NONE)
                start_char_idx = parent_doc.text.find(node_content, search_start)

                # update start/end char idx
                if start_char_idx >= 0 and isinstance(node, TextNode):
                    node.start_char_idx = start_char_idx
                    node.end_char_idx = start_char_idx + len(node_content)
                    # Update search position to start from next character after this node's START
                    # This allows overlapping chunks to be found correctly
                    doc_search_positions[doc_id] = start_char_idx + 1

                # update metadata
                if self.include_metadata:
                    # Merge parent_doc.metadata into nodes.metadata, giving preference to node's values
                    node.metadata = {**parent_doc.metadata, **node.metadata}

            if parent_node is not None:
                if self.include_metadata:
                    parent_metadata = parent_node.metadata

                    combined_metadata = {**parent_metadata, **node.metadata}

                    # Merge parent_node.metadata into nodes.metadata, giving preference to node's values
                    node.metadata.update(combined_metadata)

            if self.include_prev_next_rel:
                # establish prev/next relationships if nodes share the same source_node
                if (
                    i > 0
                    and node.source_node
                    and nodes[i - 1].source_node
                    and nodes[i - 1].source_node.node_id == node.source_node.node_id  # type: ignore
                ):
                    node.relationships[NodeRelationship.PREVIOUS] = nodes[
                        i - 1
                    ].as_related_node_info()
                if (
                    i < len(nodes) - 1
                    and node.source_node
                    and nodes[i + 1].source_node
                    and nodes[i + 1].source_node.node_id == node.source_node.node_id  # type: ignore
                ):
                    node.relationships[NodeRelationship.NEXT] = nodes[
                        i + 1
                    ].as_related_node_info()

        return nodes

    def get_nodes_from_documents(
        self,
        documents: Sequence[Document],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> List[BaseNode]:
        """
        Parse documents into nodes.

        Args:
            documents (Sequence[Document]): documents to parse
            show_progress (bool): whether to show progress bar

        """
        doc_id_to_document = {doc.id_: doc for doc in documents}

        with self.callback_manager.event(
            CBEventType.NODE_PARSING, payload={EventPayload.DOCUMENTS: documents}
        ) as event:
            nodes = self._parse_nodes(documents, show_progress=show_progress, **kwargs)
            nodes = self._postprocess_parsed_nodes(nodes, doc_id_to_document)

            event.on_end({EventPayload.NODES: nodes})

        return nodes

    async def aget_nodes_from_documents(
        self,
        documents: Sequence[Document],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> List[BaseNode]:
        doc_id_to_document = {doc.id_: doc for doc in documents}

        with self.callback_manager.event(
            CBEventType.NODE_PARSING, payload={EventPayload.DOCUMENTS: documents}
        ) as event:
            nodes = await self._aparse_nodes(
                documents, show_progress=show_progress, **kwargs
            )
            nodes = self._postprocess_parsed_nodes(nodes, doc_id_to_document)

            event.on_end({EventPayload.NODES: nodes})

        return nodes

    def __call__(self, nodes: Sequence[BaseNode], **kwargs: Any) -> List[BaseNode]:
        return self.get_nodes_from_documents(nodes, **kwargs)  # type: ignore

    async def acall(self, nodes: Sequence[BaseNode], **kwargs: Any) -> List[BaseNode]:
        return await self.aget_nodes_from_documents(nodes, **kwargs)  # type: ignore

get_nodes_from_documents #

get_nodes_from_documents(documents: Sequence[文档], show_progress: bool = False, **kwargs: Any) -> List[BaseNode]

将文档解析为节点。

参数：

名称	类型	描述	默认
`documents`	`Sequence[文档]`	待解析的文档	required
`show_progress`	`bool`	是否显示进度条	`False`

workflows/handler.py 中的源代码llama_index/core/node_parser/interface.py

def get_nodes_from_documents(
    self,
    documents: Sequence[Document],
    show_progress: bool = False,
    **kwargs: Any,
) -> List[BaseNode]:
    """
    Parse documents into nodes.

    Args:
        documents (Sequence[Document]): documents to parse
        show_progress (bool): whether to show progress bar

    """
    doc_id_to_document = {doc.id_: doc for doc in documents}

    with self.callback_manager.event(
        CBEventType.NODE_PARSING, payload={EventPayload.DOCUMENTS: documents}
    ) as event:
        nodes = self._parse_nodes(documents, show_progress=show_progress, **kwargs)
        nodes = self._postprocess_parsed_nodes(nodes, doc_id_to_document)

        event.on_end({EventPayload.NODES: nodes})

    return nodes

TextSplitter #

基类：EventNodeParser

workflows/handler.py 中的源代码llama_index/core/node_parser/interface.py

class TextSplitter(NodeParser):
    @abstractmethod
    def split_text(self, text: str) -> List[str]: ...

    def split_texts(self, texts: List[str]) -> List[str]:
        nested_texts = [self.split_text(text) for text in texts]
        return [item for sublist in nested_texts for item in sublist]

    def _parse_nodes(
        self, nodes: Sequence[BaseNode], show_progress: bool = False, **kwargs: Any
    ) -> List[BaseNode]:
        all_nodes: List[BaseNode] = []
        nodes_with_progress = get_tqdm_iterable(nodes, show_progress, "Parsing nodes")
        for node in nodes_with_progress:
            splits = self.split_text(node.get_content())

            all_nodes.extend(
                build_nodes_from_splits(splits, node, id_func=self.id_func)
            )

        return all_nodes

分层节点解析器 #

基类：EventNodeParser

分层节点解析器。

使用节点解析器将文档分割成递归层次结构的节点。

注意：这将返回一个扁平列表中的节点层次结构，其中父节点（例如使用较大的分块大小）与每个父节点下的子节点（例如使用较小的分块大小）之间会存在重叠。

例如，这可能返回一个节点列表，如：

顶级节点列表，块大小为2048
二级节点列表，其中每个节点都是顶级节点的子节点，块大小 512
三级节点列表，其中每个节点都是二级节点的子节点，分块大小 128

参数：

名称	类型	描述	默认
`chunk_sizes`	`List[int] \| None`	分割文档时使用的块大小，按层级顺序排列。	`None`
`node_parser_ids`	`List[str]`	用于在拆分文档时使用的节点解析器ID列表，按层级顺序排列（第一个ID用于第一级，依此类推）。	`<dynamic>`
`node_parser_map`	`Dict[str, NodeParser]`	节点解析器标识到节点解析器的映射。	required

workflows/handler.py 中的源代码llama_index/core/node_parser/relational/hierarchical.py

class HierarchicalNodeParser(NodeParser):
    """
    Hierarchical node parser.

    Splits a document into a recursive hierarchy Nodes using a NodeParser.

    NOTE: this will return a hierarchy of nodes in a flat list, where there will be
    overlap between parent nodes (e.g. with a bigger chunk size), and child nodes
    per parent (e.g. with a smaller chunk size).

    For instance, this may return a list of nodes like:

    - list of top-level nodes with chunk size 2048
    - list of second-level nodes, where each node is a child of a top-level node,
      chunk size 512
    - list of third-level nodes, where each node is a child of a second-level node,
      chunk size 128
    """

    chunk_sizes: Optional[List[int]] = Field(
        default=None,
        description=(
            "The chunk sizes to use when splitting documents, in order of level."
        ),
    )
    node_parser_ids: List[str] = Field(
        default_factory=list,
        description=(
            "List of ids for the node parsers to use when splitting documents, "
            + "in order of level (first id used for first level, etc.)."
        ),
    )
    node_parser_map: Dict[str, NodeParser] = Field(
        description="Map of node parser id to node parser.",
    )

    @classmethod
    def from_defaults(
        cls,
        chunk_sizes: Optional[List[int]] = None,
        chunk_overlap: int = 20,
        node_parser_ids: Optional[List[str]] = None,
        node_parser_map: Optional[Dict[str, NodeParser]] = None,
        include_metadata: bool = True,
        include_prev_next_rel: bool = True,
        callback_manager: Optional[CallbackManager] = None,
    ) -> "HierarchicalNodeParser":
        callback_manager = callback_manager or CallbackManager([])

        if node_parser_ids is None:
            if chunk_sizes is None:
                chunk_sizes = [2048, 512, 128]

            node_parser_ids = [f"chunk_size_{chunk_size}" for chunk_size in chunk_sizes]
            node_parser_map = {}
            for chunk_size, node_parser_id in zip(chunk_sizes, node_parser_ids):
                node_parser_map[node_parser_id] = SentenceSplitter(
                    chunk_size=chunk_size,
                    callback_manager=callback_manager,
                    chunk_overlap=chunk_overlap,
                    include_metadata=include_metadata,
                    include_prev_next_rel=include_prev_next_rel,
                )
        else:
            if chunk_sizes is not None:
                raise ValueError("Cannot specify both node_parser_ids and chunk_sizes.")
            if node_parser_map is None:
                raise ValueError(
                    "Must specify node_parser_map if using node_parser_ids."
                )

        return cls(
            chunk_sizes=chunk_sizes,
            node_parser_ids=node_parser_ids,
            node_parser_map=node_parser_map,
            include_metadata=include_metadata,
            include_prev_next_rel=include_prev_next_rel,
            callback_manager=callback_manager,
        )

    @classmethod
    def class_name(cls) -> str:
        return "HierarchicalNodeParser"

    def _recursively_get_nodes_from_nodes(
        self,
        nodes: List[BaseNode],
        level: int,
        show_progress: bool = False,
    ) -> List[BaseNode]:
        """Recursively get nodes from nodes."""
        if level >= len(self.node_parser_ids):
            raise ValueError(
                f"Level {level} is greater than number of text "
                f"splitters ({len(self.node_parser_ids)})."
            )

        # first split current nodes into sub-nodes
        nodes_with_progress = get_tqdm_iterable(
            nodes, show_progress, "Parsing documents into nodes"
        )
        sub_nodes = []
        for node in nodes_with_progress:
            cur_sub_nodes = self.node_parser_map[
                self.node_parser_ids[level]
            ].get_nodes_from_documents([node])
            # add parent relationship from sub node to parent node
            # add child relationship from parent node to sub node
            # NOTE: Only add relationships if level > 0, since we don't want to add
            # relationships for the top-level document objects that we are splitting
            if level > 0:
                for sub_node in cur_sub_nodes:
                    _add_parent_child_relationship(
                        parent_node=node,
                        child_node=sub_node,
                    )

            sub_nodes.extend(cur_sub_nodes)

        # now for each sub-node, recursively split into sub-sub-nodes, and add
        if level < len(self.node_parser_ids) - 1:
            sub_sub_nodes = self._recursively_get_nodes_from_nodes(
                sub_nodes,
                level + 1,
                show_progress=show_progress,
            )
        else:
            sub_sub_nodes = []

        return sub_nodes + sub_sub_nodes

    def get_nodes_from_documents(
        self,
        documents: Sequence[Document],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> List[BaseNode]:
        """Parse document into nodes."""
        with self.callback_manager.event(
            CBEventType.NODE_PARSING, payload={EventPayload.DOCUMENTS: documents}
        ) as event:
            all_nodes: List[BaseNode] = []
            documents_with_progress = get_tqdm_iterable(
                documents, show_progress, "Parsing documents into nodes"
            )

            # TODO: a bit of a hack rn for tqdm
            for doc in documents_with_progress:
                nodes_from_doc = self._recursively_get_nodes_from_nodes([doc], 0)
                all_nodes.extend(nodes_from_doc)

            event.on_end(payload={EventPayload.NODES: all_nodes})

        return all_nodes

    # Unused abstract method
    def _parse_nodes(
        self, nodes: Sequence[BaseNode], show_progress: bool = False, **kwargs: Any
    ) -> List[BaseNode]:
        return list(nodes)

get_nodes_from_documents #

get_nodes_from_documents(documents: Sequence[文档], show_progress: bool = False, **kwargs: Any) -> List[BaseNode]

将文档解析为节点。

workflows/handler.py 中的源代码llama_index/core/node_parser/relational/hierarchical.py

def get_nodes_from_documents(
    self,
    documents: Sequence[Document],
    show_progress: bool = False,
    **kwargs: Any,
) -> List[BaseNode]:
    """Parse document into nodes."""
    with self.callback_manager.event(
        CBEventType.NODE_PARSING, payload={EventPayload.DOCUMENTS: documents}
    ) as event:
        all_nodes: List[BaseNode] = []
        documents_with_progress = get_tqdm_iterable(
            documents, show_progress, "Parsing documents into nodes"
        )

        # TODO: a bit of a hack rn for tqdm
        for doc in documents_with_progress:
            nodes_from_doc = self._recursively_get_nodes_from_nodes([doc], 0)
            all_nodes.extend(nodes_from_doc)

        event.on_end(payload={EventPayload.NODES: all_nodes})

    return all_nodes

Markdown元素节点解析器 #

基类：EventBaseElementNodeParser

Markdown 元素节点解析器。

将Markdown文档分割为文本节点和索引节点，分别对应嵌入对象（例如表格）。

workflows/handler.py 中的源代码llama_index/core/node_parser/relational/markdown_element.py

class MarkdownElementNodeParser(BaseElementNodeParser):
    """
    Markdown element node parser.

    Splits a markdown document into Text Nodes and Index Nodes corresponding to embedded objects
    (e.g. tables).

    """

    @classmethod
    def class_name(cls) -> str:
        return "MarkdownElementNodeParser"

    def get_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
        """Get nodes from node."""
        elements = self.extract_elements(
            node.get_content(), table_filters=[self.filter_table], node_id=node.node_id
        )
        elements = self.extract_html_tables(elements)
        table_elements = self.get_table_elements(elements)
        # extract summaries over table elements
        self.extract_table_summaries(table_elements)
        # convert into nodes
        # will return a list of Nodes and Index Nodes
        nodes = self.get_nodes_from_elements(
            elements, node, ref_doc_text=node.get_content()
        )
        source_document = node.source_node or node.as_related_node_info()
        for n in nodes:
            n.relationships[NodeRelationship.SOURCE] = source_document
            n.metadata.update(node.metadata)
        return nodes

    async def aget_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
        """Get nodes from node."""
        elements = self.extract_elements(
            node.get_content(), table_filters=[self.filter_table], node_id=node.node_id
        )
        table_elements = self.get_table_elements(elements)
        # extract summaries over table elements
        await self.aextract_table_summaries(table_elements)
        # convert into nodes
        # will return a list of Nodes and Index Nodes
        nodes = self.get_nodes_from_elements(
            elements, node, ref_doc_text=node.get_content()
        )
        source_document = node.source_node or node.as_related_node_info()
        for n in nodes:
            n.relationships[NodeRelationship.SOURCE] = source_document
            n.metadata.update(node.metadata)
        return nodes

    def extract_html_tables(self, elements: List[Element]) -> List[Element]:
        """
        Extract html tables from text.

        Returns:
            List[Element]: text elements split by table_text element

        """
        new_elements = []
        for element in elements:
            if element.type != "text":
                # skip when it is not text
                new_elements.append(element)
                continue
            else:
                text = element.element
                last_pos = 0
                i = 0
                n = len(text)

                while i < n:
                    table_start = text.find("<table>", i)
                    if table_start == -1:
                        break

                    table_end = text.find("</table>", table_start)
                    if table_end - table_start <= 7:
                        # not a valid <table></table>
                        break

                    # extract text before the table
                    pre_text = text[last_pos:table_start].strip()
                    if pre_text:
                        new_elements.append(
                            Element(
                                id=f"{element.id}_{len(new_elements)}",
                                type="text",
                                element=pre_text,
                            )
                        )

                    # extract the html table
                    table_content = text[
                        table_start : table_end + 8
                    ]  # 8 is length of </table>
                    new_elements.append(
                        Element(
                            id=f"{element.id}_{len(new_elements)}",
                            type="table_text",
                            element=table_content,
                        )
                    )

                    last_pos = table_end + 8
                    i = last_pos

                # add the last piece of text
                final_text = text[last_pos:].strip()
                if final_text:
                    new_elements.append(
                        Element(
                            id=f"{element.id}_{len(new_elements)}",
                            type="text",
                            element=final_text,
                        )
                    )

        return new_elements

    def extract_elements(
        self,
        text: str,
        node_id: Optional[str] = None,
        table_filters: Optional[List[Callable]] = None,
        **kwargs: Any,
    ) -> List[Element]:
        # get node id for each node so that we can avoid using the same id for different nodes
        """Extract elements from text."""
        lines = text.split("\n")
        currentElement = None

        elements: List[Element] = []
        # Then parse the lines
        for line in lines:
            if line.startswith("```"):
                # check if this is the end of a code block
                if currentElement is not None and currentElement.type == "code":
                    elements.append(currentElement)
                    currentElement = None
                    # if there is some text after the ``` create a text element with it
                    if len(line) > 3:
                        elements.append(
                            Element(
                                id=f"id_{len(elements)}",
                                type="text",
                                element=line.lstrip("```"),
                            )
                        )

                elif line.count("```") == 2 and line[-3] != "`":
                    # check if inline code block (aka have a second ``` in line but not at the end)
                    if currentElement is not None:
                        elements.append(currentElement)
                    currentElement = Element(
                        id=f"id_{len(elements)}",
                        type="code",
                        element=line.lstrip("```"),
                    )
                elif currentElement is not None and currentElement.type == "text":
                    currentElement.element += "\n" + line
                else:
                    if currentElement is not None:
                        elements.append(currentElement)
                    currentElement = Element(
                        id=f"id_{len(elements)}", type="text", element=line
                    )
            elif currentElement is not None and currentElement.type == "code":
                currentElement.element += "\n" + line

            elif line.startswith("|"):
                if currentElement is not None and currentElement.type != "table":
                    if currentElement is not None:
                        elements.append(currentElement)
                    currentElement = Element(
                        id=f"id_{len(elements)}", type="table", element=line
                    )
                elif currentElement is not None:
                    currentElement.element += "\n" + line
                else:
                    currentElement = Element(
                        id=f"id_{len(elements)}", type="table", element=line
                    )
            elif line.startswith("#"):
                if currentElement is not None:
                    elements.append(currentElement)
                currentElement = Element(
                    id=f"id_{len(elements)}",
                    type="title",
                    element=line.lstrip("#"),
                    title_level=len(line) - len(line.lstrip("#")),
                )
            else:
                if currentElement is not None and currentElement.type != "text":
                    elements.append(currentElement)
                    currentElement = Element(
                        id=f"id_{len(elements)}", type="text", element=line
                    )
                elif currentElement is not None:
                    currentElement.element += "\n" + line
                else:
                    currentElement = Element(
                        id=f"id_{len(elements)}", type="text", element=line
                    )
        if currentElement is not None:
            elements.append(currentElement)

        for idx, element in enumerate(elements):
            if element.type == "table":
                should_keep = True
                perfect_table = True

                # verify that the table (markdown) have the same number of columns on each rows
                table_lines = element.element.split("\n")
                table_columns = [len(line.split("|")) for line in table_lines]
                if len(set(table_columns)) > 1:
                    # if the table have different number of columns on each rows, it's not a perfect table
                    # we will store the raw text for such tables instead of converting them to a dataframe
                    perfect_table = False

                # verify that the table (markdown) have at least 2 rows
                if len(table_lines) < 2:
                    should_keep = False

                # apply the table filter, now only filter empty tables
                if should_keep and perfect_table and table_filters is not None:
                    should_keep = all(tf(element) for tf in table_filters)

                # if the element is a table, convert it to a dataframe
                if should_keep:
                    if perfect_table:
                        table = md_to_df(element.element)

                        elements[idx] = Element(
                            id=f"id_{node_id}_{idx}" if node_id else f"id_{idx}",
                            type="table",
                            element=element.element,
                            table=table,
                        )
                    else:
                        # for non-perfect tables, we will store the raw text
                        # and give it a different type to differentiate it from perfect tables
                        elements[idx] = Element(
                            id=f"id_{node_id}_{idx}" if node_id else f"id_{idx}",
                            type="table_text",
                            element=element.element,
                            # table=table
                        )
                else:
                    elements[idx] = Element(
                        id=f"id_{node_id}_{idx}" if node_id else f"id_{idx}",
                        type="text",
                        element=element.element,
                    )
            else:
                # if the element is not a table, keep it as to text
                elements[idx] = Element(
                    id=f"id_{node_id}_{idx}" if node_id else f"id_{idx}",
                    type="text",
                    element=element.element,
                )

        # merge consecutive text elements together for now
        merged_elements: List[Element] = []
        for element in elements:
            if (
                len(merged_elements) > 0
                and element.type == "text"
                and merged_elements[-1].type == "text"
            ):
                merged_elements[-1].element += "\n" + element.element
            else:
                merged_elements.append(element)
        elements = merged_elements
        return merged_elements

    def filter_table(self, table_element: Any) -> bool:
        """Filter tables."""
        table_df = md_to_df(table_element.element)

        # check if table_df is not None, has more than one row, and more than one column
        return table_df is not None and not table_df.empty and len(table_df.columns) > 1

get_nodes_from_node #

get_nodes_from_node(node: TextNode) -> List[BaseNode]

从节点获取节点。

workflows/handler.py 中的源代码llama_index/core/node_parser/relational/markdown_element.py

def get_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
    """Get nodes from node."""
    elements = self.extract_elements(
        node.get_content(), table_filters=[self.filter_table], node_id=node.node_id
    )
    elements = self.extract_html_tables(elements)
    table_elements = self.get_table_elements(elements)
    # extract summaries over table elements
    self.extract_table_summaries(table_elements)
    # convert into nodes
    # will return a list of Nodes and Index Nodes
    nodes = self.get_nodes_from_elements(
        elements, node, ref_doc_text=node.get_content()
    )
    source_document = node.source_node or node.as_related_node_info()
    for n in nodes:
        n.relationships[NodeRelationship.SOURCE] = source_document
        n.metadata.update(node.metadata)
    return nodes

aget_nodes_from_node `async` #

aget_nodes_from_node(node: TextNode) -> List[BaseNode]

从节点获取节点。

workflows/handler.py 中的源代码llama_index/core/node_parser/relational/markdown_element.py

async def aget_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
    """Get nodes from node."""
    elements = self.extract_elements(
        node.get_content(), table_filters=[self.filter_table], node_id=node.node_id
    )
    table_elements = self.get_table_elements(elements)
    # extract summaries over table elements
    await self.aextract_table_summaries(table_elements)
    # convert into nodes
    # will return a list of Nodes and Index Nodes
    nodes = self.get_nodes_from_elements(
        elements, node, ref_doc_text=node.get_content()
    )
    source_document = node.source_node or node.as_related_node_info()
    for n in nodes:
        n.relationships[NodeRelationship.SOURCE] = source_document
        n.metadata.update(node.metadata)
    return nodes

extract_html_tables #

extract_html_tables(elements: List[Element]) -> List[Element]

从文本中提取HTML表格。

返回：

类型	描述
`List[Element]`	列表[元素]：按表格文本元素分割的文本元素

workflows/handler.py 中的源代码llama_index/core/node_parser/relational/markdown_element.py

def extract_html_tables(self, elements: List[Element]) -> List[Element]:
    """
    Extract html tables from text.

    Returns:
        List[Element]: text elements split by table_text element

    """
    new_elements = []
    for element in elements:
        if element.type != "text":
            # skip when it is not text
            new_elements.append(element)
            continue
        else:
            text = element.element
            last_pos = 0
            i = 0
            n = len(text)

            while i < n:
                table_start = text.find("<table>", i)
                if table_start == -1:
                    break

                table_end = text.find("</table>", table_start)
                if table_end - table_start <= 7:
                    # not a valid <table></table>
                    break

                # extract text before the table
                pre_text = text[last_pos:table_start].strip()
                if pre_text:
                    new_elements.append(
                        Element(
                            id=f"{element.id}_{len(new_elements)}",
                            type="text",
                            element=pre_text,
                        )
                    )

                # extract the html table
                table_content = text[
                    table_start : table_end + 8
                ]  # 8 is length of </table>
                new_elements.append(
                    Element(
                        id=f"{element.id}_{len(new_elements)}",
                        type="table_text",
                        element=table_content,
                    )
                )

                last_pos = table_end + 8
                i = last_pos

            # add the last piece of text
            final_text = text[last_pos:].strip()
            if final_text:
                new_elements.append(
                    Element(
                        id=f"{element.id}_{len(new_elements)}",
                        type="text",
                        element=final_text,
                    )
                )

    return new_elements

extract_elements #

extract_elements(text: str, node_id: Optional[str] = None, table_filters: Optional[List[Callable]] = None, **kwargs: Any) -> List[Element]

从文本中提取元素。

workflows/handler.py 中的源代码llama_index/core/node_parser/relational/markdown_element.py

def extract_elements(
    self,
    text: str,
    node_id: Optional[str] = None,
    table_filters: Optional[List[Callable]] = None,
    **kwargs: Any,
) -> List[Element]:
    # get node id for each node so that we can avoid using the same id for different nodes
    """Extract elements from text."""
    lines = text.split("\n")
    currentElement = None

    elements: List[Element] = []
    # Then parse the lines
    for line in lines:
        if line.startswith("```"):
            # check if this is the end of a code block
            if currentElement is not None and currentElement.type == "code":
                elements.append(currentElement)
                currentElement = None
                # if there is some text after the ``` create a text element with it
                if len(line) > 3:
                    elements.append(
                        Element(
                            id=f"id_{len(elements)}",
                            type="text",
                            element=line.lstrip("```"),
                        )
                    )

            elif line.count("```") == 2 and line[-3] != "`":
                # check if inline code block (aka have a second ``` in line but not at the end)
                if currentElement is not None:
                    elements.append(currentElement)
                currentElement = Element(
                    id=f"id_{len(elements)}",
                    type="code",
                    element=line.lstrip("```"),
                )
            elif currentElement is not None and currentElement.type == "text":
                currentElement.element += "\n" + line
            else:
                if currentElement is not None:
                    elements.append(currentElement)
                currentElement = Element(
                    id=f"id_{len(elements)}", type="text", element=line
                )
        elif currentElement is not None and currentElement.type == "code":
            currentElement.element += "\n" + line

        elif line.startswith("|"):
            if currentElement is not None and currentElement.type != "table":
                if currentElement is not None:
                    elements.append(currentElement)
                currentElement = Element(
                    id=f"id_{len(elements)}", type="table", element=line
                )
            elif currentElement is not None:
                currentElement.element += "\n" + line
            else:
                currentElement = Element(
                    id=f"id_{len(elements)}", type="table", element=line
                )
        elif line.startswith("#"):
            if currentElement is not None:
                elements.append(currentElement)
            currentElement = Element(
                id=f"id_{len(elements)}",
                type="title",
                element=line.lstrip("#"),
                title_level=len(line) - len(line.lstrip("#")),
            )
        else:
            if currentElement is not None and currentElement.type != "text":
                elements.append(currentElement)
                currentElement = Element(
                    id=f"id_{len(elements)}", type="text", element=line
                )
            elif currentElement is not None:
                currentElement.element += "\n" + line
            else:
                currentElement = Element(
                    id=f"id_{len(elements)}", type="text", element=line
                )
    if currentElement is not None:
        elements.append(currentElement)

    for idx, element in enumerate(elements):
        if element.type == "table":
            should_keep = True
            perfect_table = True

            # verify that the table (markdown) have the same number of columns on each rows
            table_lines = element.element.split("\n")
            table_columns = [len(line.split("|")) for line in table_lines]
            if len(set(table_columns)) > 1:
                # if the table have different number of columns on each rows, it's not a perfect table
                # we will store the raw text for such tables instead of converting them to a dataframe
                perfect_table = False

            # verify that the table (markdown) have at least 2 rows
            if len(table_lines) < 2:
                should_keep = False

            # apply the table filter, now only filter empty tables
            if should_keep and perfect_table and table_filters is not None:
                should_keep = all(tf(element) for tf in table_filters)

            # if the element is a table, convert it to a dataframe
            if should_keep:
                if perfect_table:
                    table = md_to_df(element.element)

                    elements[idx] = Element(
                        id=f"id_{node_id}_{idx}" if node_id else f"id_{idx}",
                        type="table",
                        element=element.element,
                        table=table,
                    )
                else:
                    # for non-perfect tables, we will store the raw text
                    # and give it a different type to differentiate it from perfect tables
                    elements[idx] = Element(
                        id=f"id_{node_id}_{idx}" if node_id else f"id_{idx}",
                        type="table_text",
                        element=element.element,
                        # table=table
                    )
            else:
                elements[idx] = Element(
                    id=f"id_{node_id}_{idx}" if node_id else f"id_{idx}",
                    type="text",
                    element=element.element,
                )
        else:
            # if the element is not a table, keep it as to text
            elements[idx] = Element(
                id=f"id_{node_id}_{idx}" if node_id else f"id_{idx}",
                type="text",
                element=element.element,
            )

    # merge consecutive text elements together for now
    merged_elements: List[Element] = []
    for element in elements:
        if (
            len(merged_elements) > 0
            and element.type == "text"
            and merged_elements[-1].type == "text"
        ):
            merged_elements[-1].element += "\n" + element.element
        else:
            merged_elements.append(element)
    elements = merged_elements
    return merged_elements

filter_table #

filter_table(table_element: Any) -> bool

筛选表格。

workflows/handler.py 中的源代码llama_index/core/node_parser/relational/markdown_element.py

def filter_table(self, table_element: Any) -> bool:
    """Filter tables."""
    table_df = md_to_df(table_element.element)

    # check if table_df is not None, has more than one row, and more than one column
    return table_df is not None and not table_df.empty and len(table_df.columns) > 1

非结构化元素节点解析器 #

基类：EventBaseElementNodeParser

非结构化元素节点解析器。

将文档分割为文本节点和对应于嵌入对象（例如表格）的索引节点。

参数：

名称	类型	描述	默认
`partitioning_parameters`	`Dict[str, Any] \| None`	表示分区过程参数的额外字典。	`{}`

workflows/handler.py 中的源代码llama_index/core/node_parser/relational/unstructured_element.py

class UnstructuredElementNodeParser(BaseElementNodeParser):
    """
    Unstructured element node parser.

    Splits a document into Text Nodes and Index Nodes corresponding to embedded objects
    (e.g. tables).

    """

    partitioning_parameters: Optional[Dict[str, Any]] = Field(
        default={},
        description="Extra dictionary representing parameters of the partitioning process.",
    )

    def __init__(
        self,
        callback_manager: Optional[CallbackManager] = None,
        llm: Optional[Any] = None,
        summary_query_str: str = DEFAULT_SUMMARY_QUERY_STR,
        partitioning_parameters: Optional[Dict[str, Any]] = {},
    ) -> None:
        """Initialize."""
        try:
            import lxml  # noqa  # pants: no-infer-dep
            import unstructured  # noqa  # pants: no-infer-dep
        except ImportError:
            raise ImportError(
                "You must install the `unstructured` and `lxml` "
                "package to use this node parser."
            )
        callback_manager = callback_manager or CallbackManager([])

        return super().__init__(
            callback_manager=callback_manager,
            llm=llm,
            summary_query_str=summary_query_str,
            partitioning_parameters=partitioning_parameters,
        )

    @classmethod
    def class_name(cls) -> str:
        return "UnstructuredElementNodeParser"

    def get_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
        """Get nodes from node."""
        elements = self.extract_elements(
            node.get_content(), table_filters=[self.filter_table]
        )
        table_elements = self.get_table_elements(elements)
        # extract summaries over table elements
        self.extract_table_summaries(table_elements)
        # convert into nodes
        # will return a list of Nodes and Index Nodes
        nodes = self.get_nodes_from_elements(
            elements, node, ref_doc_text=node.get_content()
        )

        source_document = node.source_node or node.as_related_node_info()
        for n in nodes:
            n.relationships[NodeRelationship.SOURCE] = source_document
            n.metadata.update(node.metadata)
        return nodes

    async def aget_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
        """Get nodes from node."""
        elements = self.extract_elements(
            node.get_content(), table_filters=[self.filter_table]
        )
        table_elements = self.get_table_elements(elements)
        # extract summaries over table elements
        await self.aextract_table_summaries(table_elements)
        # convert into nodes
        # will return a list of Nodes and Index Nodes
        nodes = self.get_nodes_from_elements(
            elements, node, ref_doc_text=node.get_content()
        )

        source_document = node.source_node or node.as_related_node_info()
        for n in nodes:
            n.relationships[NodeRelationship.SOURCE] = source_document
            n.metadata.update(node.metadata)
        return nodes

    def extract_elements(
        self, text: str, table_filters: Optional[List[Callable]] = None, **kwargs: Any
    ) -> List[Element]:
        """Extract elements from text."""
        from unstructured.partition.html import partition_html  # pants: no-infer-dep

        table_filters = table_filters or []
        partitioning_parameters = self.partitioning_parameters or {}
        elements = partition_html(text=text, **partitioning_parameters)
        output_els = []
        for idx, element in enumerate(elements):
            if "unstructured.documents.elements.Table" in str(type(element)):
                should_keep = all(tf(element) for tf in table_filters)
                if should_keep:
                    table_df = html_to_df(str(element.metadata.text_as_html))
                    output_els.append(
                        Element(
                            id=f"id_{idx}",
                            type="table",
                            element=element,
                            table=table_df,
                        )
                    )
                else:
                    # if not a table, keep it as Text as we don't want to lose context
                    from unstructured.documents.elements import Text

                    new_element = Text(str(element))
                    output_els.append(
                        Element(id=f"id_{idx}", type="text", element=new_element)
                    )
            else:
                output_els.append(Element(id=f"id_{idx}", type="text", element=element))
        return output_els

    def filter_table(self, table_element: Any) -> bool:
        """Filter tables."""
        table_df = html_to_df(table_element.metadata.text_as_html)

        # check if table_df is not None, has more than one row, and more than one column
        return table_df is not None and not table_df.empty and len(table_df.columns) > 1

get_nodes_from_node #

get_nodes_from_node(node: TextNode) -> List[BaseNode]

从节点获取节点。

workflows/handler.py 中的源代码llama_index/core/node_parser/relational/unstructured_element.py

def get_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
    """Get nodes from node."""
    elements = self.extract_elements(
        node.get_content(), table_filters=[self.filter_table]
    )
    table_elements = self.get_table_elements(elements)
    # extract summaries over table elements
    self.extract_table_summaries(table_elements)
    # convert into nodes
    # will return a list of Nodes and Index Nodes
    nodes = self.get_nodes_from_elements(
        elements, node, ref_doc_text=node.get_content()
    )

    source_document = node.source_node or node.as_related_node_info()
    for n in nodes:
        n.relationships[NodeRelationship.SOURCE] = source_document
        n.metadata.update(node.metadata)
    return nodes

aget_nodes_from_node `async` #

aget_nodes_from_node(node: TextNode) -> List[BaseNode]

从节点获取节点。

workflows/handler.py 中的源代码llama_index/core/node_parser/relational/unstructured_element.py

async def aget_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
    """Get nodes from node."""
    elements = self.extract_elements(
        node.get_content(), table_filters=[self.filter_table]
    )
    table_elements = self.get_table_elements(elements)
    # extract summaries over table elements
    await self.aextract_table_summaries(table_elements)
    # convert into nodes
    # will return a list of Nodes and Index Nodes
    nodes = self.get_nodes_from_elements(
        elements, node, ref_doc_text=node.get_content()
    )

    source_document = node.source_node or node.as_related_node_info()
    for n in nodes:
        n.relationships[NodeRelationship.SOURCE] = source_document
        n.metadata.update(node.metadata)
    return nodes

extract_elements #

extract_elements(text: str, table_filters: Optional[List[Callable]] = None, **kwargs: Any) -> List[Element]

从文本中提取元素。

workflows/handler.py 中的源代码llama_index/core/node_parser/relational/unstructured_element.py

def extract_elements(
    self, text: str, table_filters: Optional[List[Callable]] = None, **kwargs: Any
) -> List[Element]:
    """Extract elements from text."""
    from unstructured.partition.html import partition_html  # pants: no-infer-dep

    table_filters = table_filters or []
    partitioning_parameters = self.partitioning_parameters or {}
    elements = partition_html(text=text, **partitioning_parameters)
    output_els = []
    for idx, element in enumerate(elements):
        if "unstructured.documents.elements.Table" in str(type(element)):
            should_keep = all(tf(element) for tf in table_filters)
            if should_keep:
                table_df = html_to_df(str(element.metadata.text_as_html))
                output_els.append(
                    Element(
                        id=f"id_{idx}",
                        type="table",
                        element=element,
                        table=table_df,
                    )
                )
            else:
                # if not a table, keep it as Text as we don't want to lose context
                from unstructured.documents.elements import Text

                new_element = Text(str(element))
                output_els.append(
                    Element(id=f"id_{idx}", type="text", element=new_element)
                )
        else:
            output_els.append(Element(id=f"id_{idx}", type="text", element=element))
    return output_els

filter_table #

filter_table(table_element: Any) -> bool

筛选表格。

workflows/handler.py 中的源代码llama_index/core/node_parser/relational/unstructured_element.py

def filter_table(self, table_element: Any) -> bool:
    """Filter tables."""
    table_df = html_to_df(table_element.metadata.text_as_html)

    # check if table_df is not None, has more than one row, and more than one column
    return table_df is not None and not table_df.empty and len(table_df.columns) > 1

Llama解析JSON节点解析器 #

基类：EventBaseElementNodeParser

Llama Parse Json 格式元素节点解析器。

将来自LlamaParse的JSON格式文档拆分为文本节点和索引节点，对应嵌入对象（例如表格）。

workflows/handler.py 中的源代码llama_index/core/node_parser/relational/llama_parse_json_element.py

class LlamaParseJsonNodeParser(BaseElementNodeParser):
    """
    Llama Parse Json format element node parser.

    Splits a json format document from LlamaParse into Text Nodes and Index Nodes
    corresponding to embedded objects (e.g. tables).

    """

    @classmethod
    def class_name(cls) -> str:
        return "LlamaParseJsonNodeParser"

    def get_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
        """Get nodes from node."""
        elements = self.extract_elements(
            node.get_content(),
            table_filters=[self.filter_table],
            node_id=node.id_,
            node_metadata=node.metadata,
        )
        table_elements = self.get_table_elements(elements)
        # extract summaries over table elements
        self.extract_table_summaries(table_elements)
        # convert into nodes
        # will return a list of Nodes and Index Nodes
        return self.get_nodes_from_elements(
            elements, node, ref_doc_text=node.get_content()
        )

    async def aget_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
        """Get nodes from node."""
        elements = self.extract_elements(
            node.get_content(),
            table_filters=[self.filter_table],
            node_id=node.id_,
            node_metadata=node.metadata,
        )
        table_elements = self.get_table_elements(elements)
        # extract summaries over table elements
        await self.aextract_table_summaries(table_elements)
        # convert into nodes
        # will return a list of Nodes and Index Nodes
        return self.get_nodes_from_elements(
            elements, node, ref_doc_text=node.get_content()
        )

    def extract_elements(
        self,
        text: str,
        mode: Optional[str] = "json",
        node_id: Optional[str] = None,
        node_metadata: Optional[Dict[str, Any]] = None,
        table_filters: Optional[List[Callable]] = None,
        **kwargs: Any,
    ) -> List[Element]:
        # get node id for each node so that we can avoid using the same id for different nodes
        """
        Extract elements from json based nodes.

        Args:
            text: node's text content
            mode: different modes for returning different types of elements based on the selected mode
            node_id: unique id for the node
            node_metadata: metadata for the node. the json output for the nodes contains a lot of fields for elements

        """
        elements: List[Element] = []
        currentElement = None
        page_number = node_metadata.get("page") if node_metadata is not None else 0

        if mode == "json" and node_metadata is not None:
            json_items = node_metadata.get("items") or []
            for element_idx, json_item in enumerate(json_items):
                ele_type = json_item.get("type")
                if ele_type == "heading":
                    elements.append(
                        Element(
                            id=f"id_page_{page_number}_heading_{element_idx}",
                            type="heading",
                            title_level=json_item.get("lvl"),
                            element=json_item.get("value"),
                            markdown=json_item.get("md"),
                            page_number=page_number,
                        )
                    )
                elif ele_type == "text":
                    elements.append(
                        Element(
                            id=f"id_page_{page_number}_text_{element_idx}",
                            type="text",
                            element=json_item.get("value"),
                            markdown=json_item.get("md"),
                            page_number=page_number,
                        )
                    )
                elif ele_type == "table":
                    elements.append(
                        Element(
                            id=f"id_page_{page_number}_table_{element_idx}",
                            type="table",
                            element=json_item.get("rows"),
                            markdown=json_item.get("md"),
                            page_number=page_number,
                        )
                    )
        elif mode == "images" and node_metadata is not None:
            # only get images from json metadata
            images = node_metadata.get("images") or []
            for idx, image in enumerate(images):
                elements.append(
                    Element(
                        id=f"id_page_{page_number}_image_{idx}",
                        type="image",
                        element=image,
                    )
                )
        else:
            lines = text.split("\n")
            # Then parse the lines from raw text of json
            for line in lines:
                if line.startswith("```"):
                    # check if this is the end of a code block
                    if currentElement is not None and currentElement.type == "code":
                        elements.append(currentElement)
                        currentElement = None
                        # if there is some text after the ``` create a text element with it
                        if len(line) > 3:
                            elements.append(
                                Element(
                                    id=f"id_{len(elements)}",
                                    type="text",
                                    element=line.lstrip("```"),
                                )
                            )

                    elif line.count("```") == 2 and line[-3] != "`":
                        # check if inline code block (aka have a second ``` in line but not at the end)
                        if currentElement is not None:
                            elements.append(currentElement)
                        currentElement = Element(
                            id=f"id_{len(elements)}",
                            type="code",
                            element=line.lstrip("```"),
                        )
                    elif currentElement is not None and currentElement.type == "text":
                        currentElement.element += "\n" + line
                    else:
                        if currentElement is not None:
                            elements.append(currentElement)
                        currentElement = Element(
                            id=f"id_{len(elements)}", type="text", element=line
                        )

                elif currentElement is not None and currentElement.type == "code":
                    currentElement.element += "\n" + line

                elif line.startswith("|"):
                    if currentElement is not None and currentElement.type != "table":
                        if currentElement is not None:
                            elements.append(currentElement)
                        currentElement = Element(
                            id=f"id_{len(elements)}", type="table", element=line
                        )
                    elif currentElement is not None:
                        currentElement.element += "\n" + line
                    else:
                        currentElement = Element(
                            id=f"id_{len(elements)}", type="table", element=line
                        )
                elif line.startswith("#"):
                    if currentElement is not None:
                        elements.append(currentElement)
                    currentElement = Element(
                        id=f"id_{len(elements)}",
                        type="title",
                        element=line.lstrip("#"),
                        title_level=len(line) - len(line.lstrip("#")),
                    )
                else:
                    if currentElement is not None and currentElement.type != "text":
                        elements.append(currentElement)
                        currentElement = Element(
                            id=f"id_{len(elements)}", type="text", element=line
                        )
                    elif currentElement is not None:
                        currentElement.element += "\n" + line
                    else:
                        currentElement = Element(
                            id=f"id_{len(elements)}", type="text", element=line
                        )
        if currentElement is not None:
            elements.append(currentElement)

        for idx, element in enumerate(elements):
            if element.type == "table":
                assert element.markdown is not None

                should_keep = True
                perfect_table = True

                # verify that the table (markdown) have the same number of columns on each rows
                table_lines = element.markdown.split("\n")
                table_columns = [len(line.split("|")) for line in table_lines]
                if len(set(table_columns)) > 1:
                    # if the table have different number of columns on each rows, it's not a perfect table
                    # we will store the raw text for such tables instead of converting them to a dataframe
                    perfect_table = False

                # verify that the table (markdown) have at least 2 rows
                if len(table_lines) < 2:
                    should_keep = False

                # apply the table filter, now only filter empty tables
                if should_keep and perfect_table and table_filters is not None:
                    should_keep = all(tf(element) for tf in table_filters)

                # if the element is a table, convert it to a dataframe
                if should_keep:
                    if perfect_table:
                        assert element.markdown is not None
                        table = md_to_df(element.markdown)

                        elements[idx] = Element(
                            id=(
                                f"id_page_{page_number}_{node_id}_{idx}"
                                if node_id
                                else f"id_{idx}"
                            ),
                            type="table",
                            element=element,
                            table=table,
                        )
                    else:
                        # for non-perfect tables, we will store the raw text
                        # and give it a different type to differentiate it from perfect tables
                        elements[idx] = Element(
                            id=(
                                f"id_page_{page_number}_{node_id}_{idx}"
                                if node_id
                                else f"id_{idx}"
                            ),
                            type="table_text",
                            element=element.element,
                            # table=table
                        )
                else:
                    elements[idx] = Element(
                        id=(
                            f"id_page_{page_number}_{node_id}_{idx}"
                            if node_id
                            else f"id_page_{page_number}_{idx}"
                        ),
                        type="text",
                        element=element.element,
                    )
            else:
                # if the element is not a table, keep it as to text
                elements[idx] = Element(
                    id=(
                        f"id_page_{page_number}_{node_id}_{idx}"
                        if node_id
                        else f"id_page_{page_number}_{idx}"
                    ),
                    type="text",
                    element=element.element,
                )

        # merge consecutive text elements together for now
        merged_elements: List[Element] = []
        for element in elements:
            if (
                len(merged_elements) > 0
                and element.type == "text"
                and merged_elements[-1].type == "text"
            ):
                if isinstance(element.element, list):
                    merged_elements[-1].element += "\n" + " ".join(
                        str(e) for e in element.element
                    )
                else:
                    merged_elements[-1].element += "\n" + element.element
            else:
                merged_elements.append(element)
        elements = merged_elements
        return merged_elements

    def filter_table(self, table_element: Any) -> bool:
        """Filter tables."""
        # convert markdown of the table to df
        table_df = md_to_df(table_element.markdown)

        # check if table_df is not None, has more than one row, and more than one column
        return table_df is not None and not table_df.empty and len(table_df.columns) > 1

get_nodes_from_node #

get_nodes_from_node(node: TextNode) -> List[BaseNode]

从节点获取节点。

workflows/handler.py 中的源代码llama_index/core/node_parser/relational/llama_parse_json_element.py

def get_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
    """Get nodes from node."""
    elements = self.extract_elements(
        node.get_content(),
        table_filters=[self.filter_table],
        node_id=node.id_,
        node_metadata=node.metadata,
    )
    table_elements = self.get_table_elements(elements)
    # extract summaries over table elements
    self.extract_table_summaries(table_elements)
    # convert into nodes
    # will return a list of Nodes and Index Nodes
    return self.get_nodes_from_elements(
        elements, node, ref_doc_text=node.get_content()
    )

aget_nodes_from_node `async` #

aget_nodes_from_node(node: TextNode) -> List[BaseNode]

从节点获取节点。

workflows/handler.py 中的源代码llama_index/core/node_parser/relational/llama_parse_json_element.py

async def aget_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
    """Get nodes from node."""
    elements = self.extract_elements(
        node.get_content(),
        table_filters=[self.filter_table],
        node_id=node.id_,
        node_metadata=node.metadata,
    )
    table_elements = self.get_table_elements(elements)
    # extract summaries over table elements
    await self.aextract_table_summaries(table_elements)
    # convert into nodes
    # will return a list of Nodes and Index Nodes
    return self.get_nodes_from_elements(
        elements, node, ref_doc_text=node.get_content()
    )

extract_elements #

extract_elements(text: str, mode: Optional[str] = 'json', node_id: Optional[str] = None, node_metadata: Optional[Dict[str, Any]] = None, table_filters: Optional[List[Callable]] = None, **kwargs: Any) -> List[Element]

基于JSON节点提取元素。

参数：

名称	类型	描述	默认
`text`	`str`	节点的文本内容	required
`mode`	`Optional[str]`	基于所选模式返回不同类型元素的不同模式	`'json'`
`node_id`	`Optional[str]`	节点的唯一标识符	`None`
`node_metadata`	`Optional[Dict[str, Any]]`	节点的元数据。节点的 JSON 输出包含许多元素字段	`None`

workflows/handler.py 中的源代码llama_index/core/node_parser/relational/llama_parse_json_element.py

def extract_elements(
    self,
    text: str,
    mode: Optional[str] = "json",
    node_id: Optional[str] = None,
    node_metadata: Optional[Dict[str, Any]] = None,
    table_filters: Optional[List[Callable]] = None,
    **kwargs: Any,
) -> List[Element]:
    # get node id for each node so that we can avoid using the same id for different nodes
    """
    Extract elements from json based nodes.

    Args:
        text: node's text content
        mode: different modes for returning different types of elements based on the selected mode
        node_id: unique id for the node
        node_metadata: metadata for the node. the json output for the nodes contains a lot of fields for elements

    """
    elements: List[Element] = []
    currentElement = None
    page_number = node_metadata.get("page") if node_metadata is not None else 0

    if mode == "json" and node_metadata is not None:
        json_items = node_metadata.get("items") or []
        for element_idx, json_item in enumerate(json_items):
            ele_type = json_item.get("type")
            if ele_type == "heading":
                elements.append(
                    Element(
                        id=f"id_page_{page_number}_heading_{element_idx}",
                        type="heading",
                        title_level=json_item.get("lvl"),
                        element=json_item.get("value"),
                        markdown=json_item.get("md"),
                        page_number=page_number,
                    )
                )
            elif ele_type == "text":
                elements.append(
                    Element(
                        id=f"id_page_{page_number}_text_{element_idx}",
                        type="text",
                        element=json_item.get("value"),
                        markdown=json_item.get("md"),
                        page_number=page_number,
                    )
                )
            elif ele_type == "table":
                elements.append(
                    Element(
                        id=f"id_page_{page_number}_table_{element_idx}",
                        type="table",
                        element=json_item.get("rows"),
                        markdown=json_item.get("md"),
                        page_number=page_number,
                    )
                )
    elif mode == "images" and node_metadata is not None:
        # only get images from json metadata
        images = node_metadata.get("images") or []
        for idx, image in enumerate(images):
            elements.append(
                Element(
                    id=f"id_page_{page_number}_image_{idx}",
                    type="image",
                    element=image,
                )
            )
    else:
        lines = text.split("\n")
        # Then parse the lines from raw text of json
        for line in lines:
            if line.startswith("```"):
                # check if this is the end of a code block
                if currentElement is not None and currentElement.type == "code":
                    elements.append(currentElement)
                    currentElement = None
                    # if there is some text after the ``` create a text element with it
                    if len(line) > 3:
                        elements.append(
                            Element(
                                id=f"id_{len(elements)}",
                                type="text",
                                element=line.lstrip("```"),
                            )
                        )

                elif line.count("```") == 2 and line[-3] != "`":
                    # check if inline code block (aka have a second ``` in line but not at the end)
                    if currentElement is not None:
                        elements.append(currentElement)
                    currentElement = Element(
                        id=f"id_{len(elements)}",
                        type="code",
                        element=line.lstrip("```"),
                    )
                elif currentElement is not None and currentElement.type == "text":
                    currentElement.element += "\n" + line
                else:
                    if currentElement is not None:
                        elements.append(currentElement)
                    currentElement = Element(
                        id=f"id_{len(elements)}", type="text", element=line
                    )

            elif currentElement is not None and currentElement.type == "code":
                currentElement.element += "\n" + line

            elif line.startswith("|"):
                if currentElement is not None and currentElement.type != "table":
                    if currentElement is not None:
                        elements.append(currentElement)
                    currentElement = Element(
                        id=f"id_{len(elements)}", type="table", element=line
                    )
                elif currentElement is not None:
                    currentElement.element += "\n" + line
                else:
                    currentElement = Element(
                        id=f"id_{len(elements)}", type="table", element=line
                    )
            elif line.startswith("#"):
                if currentElement is not None:
                    elements.append(currentElement)
                currentElement = Element(
                    id=f"id_{len(elements)}",
                    type="title",
                    element=line.lstrip("#"),
                    title_level=len(line) - len(line.lstrip("#")),
                )
            else:
                if currentElement is not None and currentElement.type != "text":
                    elements.append(currentElement)
                    currentElement = Element(
                        id=f"id_{len(elements)}", type="text", element=line
                    )
                elif currentElement is not None:
                    currentElement.element += "\n" + line
                else:
                    currentElement = Element(
                        id=f"id_{len(elements)}", type="text", element=line
                    )
    if currentElement is not None:
        elements.append(currentElement)

    for idx, element in enumerate(elements):
        if element.type == "table":
            assert element.markdown is not None

            should_keep = True
            perfect_table = True

            # verify that the table (markdown) have the same number of columns on each rows
            table_lines = element.markdown.split("\n")
            table_columns = [len(line.split("|")) for line in table_lines]
            if len(set(table_columns)) > 1:
                # if the table have different number of columns on each rows, it's not a perfect table
                # we will store the raw text for such tables instead of converting them to a dataframe
                perfect_table = False

            # verify that the table (markdown) have at least 2 rows
            if len(table_lines) < 2:
                should_keep = False

            # apply the table filter, now only filter empty tables
            if should_keep and perfect_table and table_filters is not None:
                should_keep = all(tf(element) for tf in table_filters)

            # if the element is a table, convert it to a dataframe
            if should_keep:
                if perfect_table:
                    assert element.markdown is not None
                    table = md_to_df(element.markdown)

                    elements[idx] = Element(
                        id=(
                            f"id_page_{page_number}_{node_id}_{idx}"
                            if node_id
                            else f"id_{idx}"
                        ),
                        type="table",
                        element=element,
                        table=table,
                    )
                else:
                    # for non-perfect tables, we will store the raw text
                    # and give it a different type to differentiate it from perfect tables
                    elements[idx] = Element(
                        id=(
                            f"id_page_{page_number}_{node_id}_{idx}"
                            if node_id
                            else f"id_{idx}"
                        ),
                        type="table_text",
                        element=element.element,
                        # table=table
                    )
            else:
                elements[idx] = Element(
                    id=(
                        f"id_page_{page_number}_{node_id}_{idx}"
                        if node_id
                        else f"id_page_{page_number}_{idx}"
                    ),
                    type="text",
                    element=element.element,
                )
        else:
            # if the element is not a table, keep it as to text
            elements[idx] = Element(
                id=(
                    f"id_page_{page_number}_{node_id}_{idx}"
                    if node_id
                    else f"id_page_{page_number}_{idx}"
                ),
                type="text",
                element=element.element,
            )

    # merge consecutive text elements together for now
    merged_elements: List[Element] = []
    for element in elements:
        if (
            len(merged_elements) > 0
            and element.type == "text"
            and merged_elements[-1].type == "text"
        ):
            if isinstance(element.element, list):
                merged_elements[-1].element += "\n" + " ".join(
                    str(e) for e in element.element
                )
            else:
                merged_elements[-1].element += "\n" + element.element
        else:
            merged_elements.append(element)
    elements = merged_elements
    return merged_elements

filter_table #

filter_table(table_element: Any) -> bool

筛选表格。

workflows/handler.py 中的源代码llama_index/core/node_parser/relational/llama_parse_json_element.py

def filter_table(self, table_element: Any) -> bool:
    """Filter tables."""
    # convert markdown of the table to df
    table_df = md_to_df(table_element.markdown)

    # check if table_df is not None, has more than one row, and more than one column
    return table_df is not None and not table_df.empty and len(table_df.columns) > 1

代码分割器 #

基类：EventTextSplitter

使用AST解析器拆分代码。

感谢 Kevin Lu / SweepAI 提出这个优雅的代码拆分解决方案。 https://docs.sweep.dev/blogs/chunking-2m-files

参数：

名称	类型	描述	默认
`language`	`str`	正在拆分的代码所使用的编程语言。	required
`chunk_lines`	`int`	每个块中包含的行数。	`40`
`chunk_lines_overlap`	`int`	每个代码块重叠的代码行数。	`15`
`max_chars`	`int`	每个分块的最大字符数。	`1500`

workflows/handler.py 中的源代码llama_index/core/node_parser/text/code.py

class CodeSplitter(TextSplitter):
    """
    Split code using a AST parser.

    Thank you to Kevin Lu / SweepAI for suggesting this elegant code splitting solution.
    https://docs.sweep.dev/blogs/chunking-2m-files
    """

    language: str = Field(
        description="The programming language of the code being split."
    )
    chunk_lines: int = Field(
        default=DEFAULT_CHUNK_LINES,
        description="The number of lines to include in each chunk.",
        gt=0,
    )
    chunk_lines_overlap: int = Field(
        default=DEFAULT_LINES_OVERLAP,
        description="How many lines of code each chunk overlaps with.",
        gt=0,
    )
    max_chars: int = Field(
        default=DEFAULT_MAX_CHARS,
        description="Maximum number of characters per chunk.",
        gt=0,
    )
    _parser: Any = PrivateAttr()

    def __init__(
        self,
        language: str,
        chunk_lines: int = DEFAULT_CHUNK_LINES,
        chunk_lines_overlap: int = DEFAULT_LINES_OVERLAP,
        max_chars: int = DEFAULT_MAX_CHARS,
        parser: Any = None,
        callback_manager: Optional[CallbackManager] = None,
        include_metadata: bool = True,
        include_prev_next_rel: bool = True,
        id_func: Optional[Callable[[int, Document], str]] = None,
    ) -> None:
        """Initialize a CodeSplitter."""
        from tree_sitter import Parser  # pants: no-infer-dep

        callback_manager = callback_manager or CallbackManager([])
        id_func = id_func or default_id_func

        super().__init__(
            language=language,
            chunk_lines=chunk_lines,
            chunk_lines_overlap=chunk_lines_overlap,
            max_chars=max_chars,
            callback_manager=callback_manager,
            include_metadata=include_metadata,
            include_prev_next_rel=include_prev_next_rel,
            id_func=id_func,
        )

        if parser is None:
            try:
                import tree_sitter_language_pack  # pants: no-infer-dep

                parser = tree_sitter_language_pack.get_parser(language)  # type: ignore
            except ImportError:
                raise ImportError(
                    "Please install tree_sitter_language_pack to use CodeSplitter."
                    "Or pass in a parser object."
                )
            except Exception:
                print(
                    f"Could not get parser for language {language}. Check "
                    "https://github.com/Goldziher/tree-sitter-language-pack?tab=readme-ov-file#available-languages "
                    "for a list of valid languages."
                )
                raise
        if not isinstance(parser, Parser):
            raise ValueError("Parser must be a tree-sitter Parser object.")

        self._parser = parser

    @classmethod
    def from_defaults(
        cls,
        language: str,
        chunk_lines: int = DEFAULT_CHUNK_LINES,
        chunk_lines_overlap: int = DEFAULT_LINES_OVERLAP,
        max_chars: int = DEFAULT_MAX_CHARS,
        callback_manager: Optional[CallbackManager] = None,
        parser: Any = None,
    ) -> "CodeSplitter":
        """Create a CodeSplitter with default values."""
        return cls(
            language=language,
            chunk_lines=chunk_lines,
            chunk_lines_overlap=chunk_lines_overlap,
            max_chars=max_chars,
            callback_manager=callback_manager,
            parser=parser,
        )

    @classmethod
    def class_name(cls) -> str:
        return "CodeSplitter"

    def _chunk_node(self, node: Any, text_bytes: bytes, last_end: int = 0) -> List[str]:
        """
        Recursively chunk a node into smaller pieces based on character limits.

        Args:
            node (Any): The AST node to chunk.
            text_bytes (bytes): The original source code text as bytes.
            last_end (int, optional): The ending position of the last processed chunk. Defaults to 0.

        Returns:
            List[str]: A list of code chunks that respect the max_chars limit.

        """
        new_chunks = []
        current_chunk = ""
        for child in node.children:
            if child.end_byte - child.start_byte > self.max_chars:
                # Child is too big, recursively chunk the child
                if len(current_chunk) > 0:
                    new_chunks.append(current_chunk)
                current_chunk = ""
                new_chunks.extend(self._chunk_node(child, text_bytes, last_end))
            elif (
                len(current_chunk) + child.end_byte - child.start_byte > self.max_chars
            ):
                # Child would make the current chunk too big, so start a new chunk
                new_chunks.append(current_chunk)
                current_chunk = text_bytes[last_end : child.end_byte].decode("utf-8")
            else:
                current_chunk += text_bytes[last_end : child.end_byte].decode("utf-8")
            last_end = child.end_byte
        if len(current_chunk) > 0:
            new_chunks.append(current_chunk)
        return new_chunks

    def split_text(self, text: str) -> List[str]:
        """
        Split incoming code into chunks using the AST parser.

        This method parses the input code into an AST and then chunks it while preserving
        syntactic structure. It handles error cases and ensures the code can be properly parsed.

        Args:
            text (str): The source code text to split.

        Returns:
            List[str]: A list of code chunks.

        Raises:
            ValueError: If the code cannot be parsed for the specified language.

        """
        """Split incoming code and return chunks using the AST."""
        with self.callback_manager.event(
            CBEventType.CHUNKING, payload={EventPayload.CHUNKS: [text]}
        ) as event:
            text_bytes = bytes(text, "utf-8")
            tree = self._parser.parse(text_bytes)

            if (
                not tree.root_node.children
                or tree.root_node.children[0].type != "ERROR"
            ):
                chunks = [
                    chunk.strip()
                    for chunk in self._chunk_node(tree.root_node, text_bytes)
                ]
                event.on_end(
                    payload={EventPayload.CHUNKS: chunks},
                )

                return chunks
            else:
                raise ValueError(f"Could not parse code with language {self.language}.")

from_defaults `classmethod` #

from_defaults(language: str, chunk_lines: int = DEFAULT_CHUNK_LINES, chunk_lines_overlap: int = DEFAULT_LINES_OVERLAP, max_chars: int = DEFAULT_MAX_CHARS, callback_manager: Optional[CallbackManager] = None, parser: Any = None) -> CodeSplitter

使用默认值创建一个代码分割器。

workflows/handler.py 中的源代码llama_index/core/node_parser/text/code.py

@classmethod
def from_defaults(
    cls,
    language: str,
    chunk_lines: int = DEFAULT_CHUNK_LINES,
    chunk_lines_overlap: int = DEFAULT_LINES_OVERLAP,
    max_chars: int = DEFAULT_MAX_CHARS,
    callback_manager: Optional[CallbackManager] = None,
    parser: Any = None,
) -> "CodeSplitter":
    """Create a CodeSplitter with default values."""
    return cls(
        language=language,
        chunk_lines=chunk_lines,
        chunk_lines_overlap=chunk_lines_overlap,
        max_chars=max_chars,
        callback_manager=callback_manager,
        parser=parser,
    )

split_text #

split_text(text: str) -> List[str]

使用抽象语法树解析器将传入的代码分割成块。

此方法将输入代码解析为抽象语法树，然后在保留语法结构的同时对其进行分块处理。它能够处理错误情况，并确保代码可以被正确解析。

参数：

名称	类型	描述	默认
`text`	`str`	要分割的源代码文本。	required

返回：

类型	描述
`List[str]`	List[str]: 代码块列表。

引发：

类型	描述
`ValueError`	如果无法解析指定语言的代码。

workflows/handler.py 中的源代码llama_index/core/node_parser/text/code.py

def split_text(self, text: str) -> List[str]:
    """
    Split incoming code into chunks using the AST parser.

    This method parses the input code into an AST and then chunks it while preserving
    syntactic structure. It handles error cases and ensures the code can be properly parsed.

    Args:
        text (str): The source code text to split.

    Returns:
        List[str]: A list of code chunks.

    Raises:
        ValueError: If the code cannot be parsed for the specified language.

    """
    """Split incoming code and return chunks using the AST."""
    with self.callback_manager.event(
        CBEventType.CHUNKING, payload={EventPayload.CHUNKS: [text]}
    ) as event:
        text_bytes = bytes(text, "utf-8")
        tree = self._parser.parse(text_bytes)

        if (
            not tree.root_node.children
            or tree.root_node.children[0].type != "ERROR"
        ):
            chunks = [
                chunk.strip()
                for chunk in self._chunk_node(tree.root_node, text_bytes)
            ]
            event.on_end(
                payload={EventPayload.CHUNKS: chunks},
            )

            return chunks
        else:
            raise ValueError(f"Could not parse code with language {self.language}.")

Langchain节点解析器 #

基类：EventTextSplitter

基于langchain文本分割器的基本封装。

待办：弄清楚如何让这个元数据感知。

workflows/handler.py 中的源代码llama_index/core/node_parser/text/langchain.py

class LangchainNodeParser(TextSplitter):
    """
    Basic wrapper around langchain's text splitter.

    TODO: Figure out how to make this metadata aware.
    """

    _lc_splitter: "LC_TextSplitter" = PrivateAttr()

    def __init__(
        self,
        lc_splitter: "LC_TextSplitter",
        callback_manager: Optional[CallbackManager] = None,
        include_metadata: bool = True,
        include_prev_next_rel: bool = True,
        id_func: Optional[Callable[[int, Document], str]] = None,
    ):
        """Initialize with parameters."""
        id_func = id_func or default_id_func

        super().__init__(
            callback_manager=callback_manager or CallbackManager(),
            include_metadata=include_metadata,
            include_prev_next_rel=include_prev_next_rel,
            id_func=id_func,
        )
        self._lc_splitter = lc_splitter

    def split_text(self, text: str) -> List[str]:
        """Split text into sentences."""
        return self._lc_splitter.split_text(text)

split_text #

split_text(text: str) -> List[str]

将文本分割成句子。

workflows/handler.py 中的源代码llama_index/core/node_parser/text/langchain.py

def split_text(self, text: str) -> List[str]:
    """Split text into sentences."""
    return self._lc_splitter.split_text(text)

语义分割节点解析器 #

基类：EventNodeParser

语义节点解析器。

将文档分割为节点，每个节点是一组语义相关的句子。

参数：

名称	类型	描述	默认
`buffer_size`	`int`	评估语义相似度时组合在一起的句子数量	`1`
`embed_model`	`BaseEmbedding`	(BaseEmbedding): 要使用的嵌入模型	required
`sentence_splitter`	`Optional[Callable]`	将文本分割成句子	`<function split_by_sentence_tokenizer.<locals>.<lambda> at 0x7f82dcb320c0>`
`breakpoint_percentile_threshold`	`int`	用于创建语义断点的相异度阈值，较低的值将生成更多节点	`95`
`include_metadata`	`bool`	是否在节点中包含元数据	required
`include_prev_next_rel`	`bool`	是否包含上一页/下一页关系	required

workflows/handler.py 中的源代码llama_index/core/node_parser/text/semantic_splitter.py

class SemanticSplitterNodeParser(NodeParser):
    """
    Semantic node parser.

    Splits a document into Nodes, with each node being a group of semantically related sentences.

    Args:
        buffer_size (int): number of sentences to group together when evaluating semantic similarity
        embed_model: (BaseEmbedding): embedding model to use
        sentence_splitter (Optional[Callable]): splits text into sentences
        breakpoint_percentile_threshold (int): dissimilarity threshold for creating semantic breakpoints, lower value will generate more nodes
        include_metadata (bool): whether to include metadata in nodes
        include_prev_next_rel (bool): whether to include prev/next relationships

    """

    sentence_splitter: SentenceSplitterCallable = Field(
        default_factory=split_by_sentence_tokenizer,
        description="The text splitter to use when splitting documents.",
        exclude=True,
    )

    embed_model: SerializeAsAny[BaseEmbedding] = Field(
        description="The embedding model to use to for semantic comparison",
    )

    buffer_size: int = Field(
        default=1,
        description=(
            "The number of sentences to group together when evaluating semantic similarity. "
            "Set to 1 to consider each sentence individually. "
            "Set to >1 to group sentences together."
        ),
    )

    breakpoint_percentile_threshold: int = Field(
        default=95,
        description=(
            "The percentile of cosine dissimilarity that must be exceeded between a "
            "group of sentences and the next to form a node.  The smaller this "
            "number is, the more nodes will be generated"
        ),
    )

    @classmethod
    def class_name(cls) -> str:
        return "SemanticSplitterNodeParser"

    @classmethod
    def from_defaults(
        cls,
        embed_model: Optional[BaseEmbedding] = None,
        breakpoint_percentile_threshold: Optional[int] = 95,
        buffer_size: Optional[int] = 1,
        sentence_splitter: Optional[Callable[[str], List[str]]] = None,
        original_text_metadata_key: str = DEFAULT_OG_TEXT_METADATA_KEY,
        include_metadata: bool = True,
        include_prev_next_rel: bool = True,
        callback_manager: Optional[CallbackManager] = None,
        id_func: Optional[Callable[[int, Document], str]] = None,
    ) -> "SemanticSplitterNodeParser":
        callback_manager = callback_manager or CallbackManager([])

        sentence_splitter = sentence_splitter or split_by_sentence_tokenizer()
        if embed_model is None:
            try:
                from llama_index.embeddings.openai import (
                    OpenAIEmbedding,
                )  # pants: no-infer-dep

                embed_model = embed_model or OpenAIEmbedding()
            except ImportError:
                raise ImportError(
                    "`llama-index-embeddings-openai` package not found, "
                    "please run `pip install llama-index-embeddings-openai`"
                )

        id_func = id_func or default_id_func

        return cls(
            embed_model=embed_model,
            breakpoint_percentile_threshold=breakpoint_percentile_threshold,
            buffer_size=buffer_size,
            sentence_splitter=sentence_splitter,
            original_text_metadata_key=original_text_metadata_key,
            include_metadata=include_metadata,
            include_prev_next_rel=include_prev_next_rel,
            callback_manager=callback_manager,
            id_func=id_func,
        )

    def _parse_nodes(
        self,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> List[BaseNode]:
        """Parse document into nodes."""
        all_nodes: List[BaseNode] = []
        nodes_with_progress = get_tqdm_iterable(nodes, show_progress, "Parsing nodes")

        for node in nodes_with_progress:
            nodes = self.build_semantic_nodes_from_documents([node], show_progress)
            all_nodes.extend(nodes)

        return all_nodes

    async def _aparse_nodes(
        self,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> List[BaseNode]:
        """Asynchronously parse document into nodes."""
        all_nodes: List[BaseNode] = []
        nodes_with_progress = get_tqdm_iterable(nodes, show_progress, "Parsing nodes")

        for node in nodes_with_progress:
            nodes = await self.abuild_semantic_nodes_from_documents(
                [node], show_progress
            )
            all_nodes.extend(nodes)

        return all_nodes

    def build_semantic_nodes_from_documents(
        self,
        documents: Sequence[Document],
        show_progress: bool = False,
    ) -> List[BaseNode]:
        """Build window nodes from documents."""
        all_nodes: List[BaseNode] = []
        for doc in documents:
            text = doc.text
            text_splits = self.sentence_splitter(text)

            sentences = self._build_sentence_groups(text_splits)

            combined_sentence_embeddings = self.embed_model.get_text_embedding_batch(
                [s["combined_sentence"] for s in sentences],
                show_progress=show_progress,
            )

            for i, embedding in enumerate(combined_sentence_embeddings):
                sentences[i]["combined_sentence_embedding"] = embedding

            distances = self._calculate_distances_between_sentence_groups(sentences)

            chunks = self._build_node_chunks(sentences, distances)

            nodes = build_nodes_from_splits(
                chunks,
                doc,
                id_func=self.id_func,
            )

            all_nodes.extend(nodes)

        return all_nodes

    async def abuild_semantic_nodes_from_documents(
        self,
        documents: Sequence[Document],
        show_progress: bool = False,
    ) -> List[BaseNode]:
        """Asynchronously build window nodes from documents."""
        all_nodes: List[BaseNode] = []
        for doc in documents:
            text = doc.text
            text_splits = self.sentence_splitter(text)

            sentences = self._build_sentence_groups(text_splits)

            combined_sentence_embeddings = (
                await self.embed_model.aget_text_embedding_batch(
                    [s["combined_sentence"] for s in sentences],
                    show_progress=show_progress,
                )
            )

            for i, embedding in enumerate(combined_sentence_embeddings):
                sentences[i]["combined_sentence_embedding"] = embedding

            distances = self._calculate_distances_between_sentence_groups(sentences)

            chunks = self._build_node_chunks(sentences, distances)

            nodes = build_nodes_from_splits(
                chunks,
                doc,
                id_func=self.id_func,
            )

            all_nodes.extend(nodes)

        return all_nodes

    def _build_sentence_groups(
        self, text_splits: List[str]
    ) -> List[SentenceCombination]:
        sentences: List[SentenceCombination] = [
            {
                "sentence": x,
                "index": i,
                "combined_sentence": "",
                "combined_sentence_embedding": [],
            }
            for i, x in enumerate(text_splits)
        ]

        # Group sentences and calculate embeddings for sentence groups
        for i in range(len(sentences)):
            combined_sentence = ""

            for j in range(i - self.buffer_size, i):
                if j >= 0:
                    combined_sentence += sentences[j]["sentence"]

            combined_sentence += sentences[i]["sentence"]

            for j in range(i + 1, i + 1 + self.buffer_size):
                if j < len(sentences):
                    combined_sentence += sentences[j]["sentence"]

            sentences[i]["combined_sentence"] = combined_sentence

        return sentences

    def _calculate_distances_between_sentence_groups(
        self, sentences: List[SentenceCombination]
    ) -> List[float]:
        distances = []
        for i in range(len(sentences) - 1):
            embedding_current = sentences[i]["combined_sentence_embedding"]
            embedding_next = sentences[i + 1]["combined_sentence_embedding"]

            similarity = self.embed_model.similarity(embedding_current, embedding_next)

            distance = 1 - similarity

            distances.append(distance)

        return distances

    def _build_node_chunks(
        self, sentences: List[SentenceCombination], distances: List[float]
    ) -> List[str]:
        chunks = []
        if len(distances) > 0:
            breakpoint_distance_threshold = np.percentile(
                distances, self.breakpoint_percentile_threshold
            )

            indices_above_threshold = [
                i for i, x in enumerate(distances) if x > breakpoint_distance_threshold
            ]

            # Chunk sentences into semantic groups based on percentile breakpoints
            start_index = 0

            for index in indices_above_threshold:
                group = sentences[start_index : index + 1]
                combined_text = "".join([d["sentence"] for d in group])
                chunks.append(combined_text)

                start_index = index + 1

            if start_index < len(sentences):
                combined_text = "".join(
                    [d["sentence"] for d in sentences[start_index:]]
                )
                chunks.append(combined_text)
        else:
            # If, for some reason we didn't get any distances (i.e. very, very small documents) just
            # treat the whole document as a single node
            chunks = [" ".join([s["sentence"] for s in sentences])]

        return chunks

build_semantic_nodes_from_documents #

build_semantic_nodes_from_documents(documents: Sequence[文档], show_progress: bool = False) -> List[BaseNode]

从文档构建窗口节点。

workflows/handler.py 中的源代码llama_index/core/node_parser/text/semantic_splitter.py

def build_semantic_nodes_from_documents(
    self,
    documents: Sequence[Document],
    show_progress: bool = False,
) -> List[BaseNode]:
    """Build window nodes from documents."""
    all_nodes: List[BaseNode] = []
    for doc in documents:
        text = doc.text
        text_splits = self.sentence_splitter(text)

        sentences = self._build_sentence_groups(text_splits)

        combined_sentence_embeddings = self.embed_model.get_text_embedding_batch(
            [s["combined_sentence"] for s in sentences],
            show_progress=show_progress,
        )

        for i, embedding in enumerate(combined_sentence_embeddings):
            sentences[i]["combined_sentence_embedding"] = embedding

        distances = self._calculate_distances_between_sentence_groups(sentences)

        chunks = self._build_node_chunks(sentences, distances)

        nodes = build_nodes_from_splits(
            chunks,
            doc,
            id_func=self.id_func,
        )

        all_nodes.extend(nodes)

    return all_nodes

abuild_semantic_nodes_from_documents `async` #

abuild_semantic_nodes_from_documents(documents: Sequence[文档], show_progress: bool = False) -> List[BaseNode]

从文档异步构建窗口节点。

workflows/handler.py 中的源代码llama_index/core/node_parser/text/semantic_splitter.py

async def abuild_semantic_nodes_from_documents(
    self,
    documents: Sequence[Document],
    show_progress: bool = False,
) -> List[BaseNode]:
    """Asynchronously build window nodes from documents."""
    all_nodes: List[BaseNode] = []
    for doc in documents:
        text = doc.text
        text_splits = self.sentence_splitter(text)

        sentences = self._build_sentence_groups(text_splits)

        combined_sentence_embeddings = (
            await self.embed_model.aget_text_embedding_batch(
                [s["combined_sentence"] for s in sentences],
                show_progress=show_progress,
            )
        )

        for i, embedding in enumerate(combined_sentence_embeddings):
            sentences[i]["combined_sentence_embedding"] = embedding

        distances = self._calculate_distances_between_sentence_groups(sentences)

        chunks = self._build_node_chunks(sentences, distances)

        nodes = build_nodes_from_splits(
            chunks,
            doc,
            id_func=self.id_func,
        )

        all_nodes.extend(nodes)

    return all_nodes

语义双重合并分割器节点解析器 #

基类：EventNodeParser

语义双重合并文本分割器。

将文档分割为节点，每个节点是一组语义相关的句子。

参数：

名称	类型	描述	默认
`language_config`	`LanguageConfig`	选择要使用的语言和spacy语言模型	`<llama_index.core.node_parser.text.semantic_double_merging_splitter.LanguageConfig object at 0x7f82f02a3110>`
`initial_threshold`	`float`	设置初始化新分块的阈值	`0.6`
`appending_threshold`	`float`	设置将新句子附加到分块的阈值	`0.8`
`merging_threshold`	`float`	设置合并整个区块的阈值	`0.8`
`max_chunk_size`	`int`	分块的最大尺寸（以字符为单位）	`1000`
`merging_range`	`int`	如果相似，在最近邻之外要合并多少个'前方'块（可用1或2个）	`1`
`merging_separator`	`str`	合并数据块时使用的分隔符。默认为单个空格。	`' '`
`sentence_splitter`	`Optional[Callable]`	将文本分割成句子	`<function split_by_sentence_tokenizer.<locals>.<lambda> at 0x7f82dcb30c20>`

workflows/handler.py 中的源代码llama_index/core/node_parser/text/semantic_double_merging_splitter.py

class SemanticDoubleMergingSplitterNodeParser(NodeParser):
    """
    Semantic double merging text splitter.

    Splits a document into Nodes, with each node being a group of semantically related sentences.

    Args:
        language_config (LanguageConfig): chooses language and spacy language model to be used
        initial_threshold (float): sets threshold for initializing new chunk
        appending_threshold (float): sets threshold for appending new sentences to chunk
        merging_threshold (float): sets threshold for merging whole chunks
        max_chunk_size (int): maximum size of chunk (in characters)
        merging_range (int): How many chunks 'ahead' beyond the nearest neighbor to be merged if similar (1 or 2 available)
        merging_separator (str): The separator to use when merging chunks. Defaults to a single space.
        sentence_splitter (Optional[Callable]): splits text into sentences

    """

    language_config: LanguageConfig = Field(
        default=LanguageConfig(),
        description="Config that selects language and spacy model for chunking",
    )

    initial_threshold: float = Field(
        default=0.6,
        description=(
            "The value of semantic similarity that must be exceeded between two"
            "sentences to create a new chunk.  The bigger this "
            "value is, the more nodes will be generated. Range is from 0 to 1."
        ),
    )

    appending_threshold: float = Field(
        default=0.8,
        description=(
            "The value of semantic similarity that must be exceeded between a "
            "chunk and new sentence to add this sentence to existing chunk.  The bigger this "
            "value is, the more nodes will be generated. Range is from 0 to 1."
        ),
    )

    merging_threshold: float = Field(
        default=0.8,
        description=(
            "The value of semantic similarity that must be exceeded between two chunks "
            "to form a bigger chunk.  The bigger this value is,"
            "the more nodes will be generated. Range is from 0 to 1."
        ),
    )

    max_chunk_size: int = Field(
        default=1000,
        description="Maximum length of chunk that can be subjected to verification (number of characters)",
    )

    merging_range: int = Field(
        default=1,
        description=(
            "How many chunks 'ahead' beyond the nearest neighbor"
            "should the algorithm check during the second pass"
            "(possible options are 1 or 2"
        ),
    )

    merging_separator: str = Field(
        default=" ",
        description="The separator to use when merging chunks. Defaults to a single space.",
    )

    sentence_splitter: Callable[[str], List[str]] = Field(
        default_factory=split_by_sentence_tokenizer,
        description="The text splitter to use when splitting documents.",
        exclude=True,
    )

    @classmethod
    def class_name(cls) -> str:
        return "SemanticDoubleMergingSplitterNodeParser"

    @classmethod
    def from_defaults(
        cls,
        language_config: Optional[LanguageConfig] = LanguageConfig(),
        initial_threshold: Optional[float] = 0.6,
        appending_threshold: Optional[float] = 0.8,
        merging_threshold: Optional[float] = 0.8,
        max_chunk_size: Optional[int] = 1000,
        merging_range: Optional[int] = 1,
        merging_separator: Optional[str] = " ",
        sentence_splitter: Optional[Callable[[str], List[str]]] = None,
        original_text_metadata_key: str = DEFAULT_OG_TEXT_METADATA_KEY,
        include_metadata: bool = True,
        include_prev_next_rel: bool = True,
        callback_manager: Optional[CallbackManager] = None,
        id_func: Optional[Callable[[int, Document], str]] = None,
    ) -> "SemanticDoubleMergingSplitterNodeParser":
        callback_manager = callback_manager or CallbackManager([])

        sentence_splitter = sentence_splitter or split_by_sentence_tokenizer()

        id_func = id_func or default_id_func

        return cls(
            language_config=language_config,
            initial_threshold=initial_threshold,
            appending_threshold=appending_threshold,
            merging_threshold=merging_threshold,
            max_chunk_size=max_chunk_size,
            merging_range=merging_range,
            merging_separator=merging_separator,
            sentence_splitter=sentence_splitter,
            original_text_metadata_key=original_text_metadata_key,
            include_metadata=include_metadata,
            include_prev_next_rel=include_prev_next_rel,
            callback_manager=callback_manager,
            id_func=id_func,
        )

    def _parse_nodes(
        self,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> List[BaseNode]:
        """Parse document into nodes."""
        # Load model
        self.language_config.load_model()

        all_nodes: List[BaseNode] = []
        nodes_with_progress = get_tqdm_iterable(nodes, show_progress, "Parsing nodes")

        for node in nodes_with_progress:
            nodes = self.build_semantic_nodes_from_nodes([node])
            all_nodes.extend(nodes)
        return all_nodes

    def build_semantic_nodes_from_documents(
        self,
        documents: Sequence[Document],
    ) -> List[BaseNode]:
        """Build window nodes from documents."""
        return self.build_semantic_nodes_from_nodes(documents)

    def build_semantic_nodes_from_nodes(
        self,
        nodes: Sequence[BaseNode],
    ) -> List[BaseNode]:
        """Build window nodes from nodes."""
        all_nodes: List[BaseNode] = []

        for node in nodes:
            text = node.get_content()
            sentences = self.sentence_splitter(text)
            sentences = [s.strip() for s in sentences]
            initial_chunks = self._create_initial_chunks(sentences)
            chunks = self._merge_initial_chunks(initial_chunks)

            split_nodes = build_nodes_from_splits(
                chunks,
                node,
                id_func=self.id_func,
            )

            previous_node: Optional[BaseNode] = None
            for split_node in split_nodes:
                if previous_node:
                    split_node.relationships[NodeRelationship.PREVIOUS] = (
                        previous_node.as_related_node_info()
                    )
                    previous_node.relationships[NodeRelationship.NEXT] = (
                        split_node.as_related_node_info()
                    )
                previous_node = split_node
            all_nodes.extend(split_nodes)

        return all_nodes

    def _create_initial_chunks(self, sentences: List[str]) -> List[str]:
        initial_chunks: List[str] = []
        chunk = sentences[0]  # ""
        new = True

        assert self.language_config.nlp is not None

        for sentence in sentences[1:]:
            if new:
                # check if 2 sentences got anything in common

                if (
                    self.language_config.nlp(
                        self._clean_text_advanced(chunk)
                    ).similarity(
                        self.language_config.nlp(self._clean_text_advanced(sentence))
                    )
                    < self.initial_threshold
                    and len(chunk) + len(sentence) + 1 <= self.max_chunk_size
                ):
                    # if not then leave first sentence as separate chunk
                    initial_chunks.append(chunk)
                    chunk = sentence
                    continue

                chunk_sentences = [chunk]
                if len(chunk) + len(sentence) + 1 <= self.max_chunk_size:
                    chunk_sentences.append(sentence)
                    chunk = self.merging_separator.join(chunk_sentences)
                    new = False
                else:
                    new = True
                    initial_chunks.append(chunk)
                    chunk = sentence
                    continue
                last_sentences = self.merging_separator.join(chunk_sentences[-2:])
                # new = False

            elif (
                self.language_config.nlp(
                    self._clean_text_advanced(last_sentences)
                ).similarity(
                    self.language_config.nlp(self._clean_text_advanced(sentence))
                )
                > self.appending_threshold
                and len(chunk) + len(sentence) + 1 <= self.max_chunk_size
            ):
                # elif nlp(last_sentences).similarity(nlp(sentence)) > self.threshold:
                chunk_sentences.append(sentence)
                last_sentences = self.merging_separator.join(chunk_sentences[-2:])
                chunk += self.merging_separator + sentence
            else:
                initial_chunks.append(chunk)
                chunk = sentence  # ""
                new = True
        initial_chunks.append(chunk)

        return initial_chunks

    def _merge_initial_chunks(self, initial_chunks: List[str]) -> List[str]:
        chunks: List[str] = []
        skip = 0
        current = initial_chunks[0]

        assert self.language_config.nlp is not None

        # TODO avoid connecting 1st chunk with 3rd if 2nd one is above some value, or if its length is above some value

        for i in range(1, len(initial_chunks)):
            # avoid connecting same chunk multiple times
            if skip > 0:
                skip -= 1
                continue

            current_nlp = self.language_config.nlp(self._clean_text_advanced(current))

            if len(current) >= self.max_chunk_size:
                chunks.append(current)
                current = initial_chunks[i]

            # check if 1st and 2nd chunk should be connected
            elif (
                current_nlp.similarity(
                    self.language_config.nlp(
                        self._clean_text_advanced(initial_chunks[i])
                    )
                )
                > self.merging_threshold
                and len(current) + len(initial_chunks[i]) + 1 <= self.max_chunk_size
            ):
                current += self.merging_separator + initial_chunks[i]

            # check if 1st and 3rd chunk are similar, if yes then merge 1st, 2nd, 3rd together
            elif (
                i <= len(initial_chunks) - 2
                and current_nlp.similarity(
                    self.language_config.nlp(
                        self._clean_text_advanced(initial_chunks[i + 1])
                    )
                )
                > self.merging_threshold
                and len(current)
                + len(initial_chunks[i])
                + len(initial_chunks[i + 1])
                + 2
                <= self.max_chunk_size
            ):
                current += (
                    self.merging_separator
                    + initial_chunks[i]
                    + self.merging_separator
                    + initial_chunks[i + 1]
                )
                skip = 1

            # check if 1st and 4th chunk are smilar, if yes then merge 1st, 2nd, 3rd and 4th together
            elif (
                i < len(initial_chunks) - 2
                and current_nlp.similarity(
                    self.language_config.nlp(
                        self._clean_text_advanced(initial_chunks[i + 2])
                    )
                )
                > self.merging_threshold
                and self.merging_range == 2
                and len(current)
                + len(initial_chunks[i])
                + len(initial_chunks[i + 1])
                + len(initial_chunks[i + 2])
                + 3
                <= self.max_chunk_size
            ):
                current += (
                    self.merging_separator
                    + initial_chunks[i]
                    + self.merging_separator
                    + initial_chunks[i + 1]
                    + self.merging_separator
                    + initial_chunks[i + 2]
                )
                skip = 2

            else:
                chunks.append(current)
                current = initial_chunks[i]

        chunks.append(current)
        return chunks

    def _clean_text_advanced(self, text: str) -> str:
        text = text.lower()
        # Remove urls
        text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
        # Remove punctuations
        text = text.translate(str.maketrans("", "", string.punctuation))
        # Remove stopwords
        tokens = globals_helper.punkt_tokenizer.tokenize(text)
        filtered_words = [w for w in tokens if w not in self.language_config.stopwords]

        return " ".join(filtered_words)

build_semantic_nodes_from_documents #

build_semantic_nodes_from_documents(documents: Sequence[文档]) -> List[BaseNode]

从文档构建窗口节点。

workflows/handler.py 中的源代码llama_index/core/node_parser/text/semantic_double_merging_splitter.py

def build_semantic_nodes_from_documents(
    self,
    documents: Sequence[Document],
) -> List[BaseNode]:
    """Build window nodes from documents."""
    return self.build_semantic_nodes_from_nodes(documents)

build_semantic_nodes_from_nodes #

build_semantic_nodes_from_nodes(nodes: Sequence[BaseNode]) -> List[BaseNode]

从节点构建窗口节点。

workflows/handler.py 中的源代码llama_index/core/node_parser/text/semantic_double_merging_splitter.py

def build_semantic_nodes_from_nodes(
    self,
    nodes: Sequence[BaseNode],
) -> List[BaseNode]:
    """Build window nodes from nodes."""
    all_nodes: List[BaseNode] = []

    for node in nodes:
        text = node.get_content()
        sentences = self.sentence_splitter(text)
        sentences = [s.strip() for s in sentences]
        initial_chunks = self._create_initial_chunks(sentences)
        chunks = self._merge_initial_chunks(initial_chunks)

        split_nodes = build_nodes_from_splits(
            chunks,
            node,
            id_func=self.id_func,
        )

        previous_node: Optional[BaseNode] = None
        for split_node in split_nodes:
            if previous_node:
                split_node.relationships[NodeRelationship.PREVIOUS] = (
                    previous_node.as_related_node_info()
                )
                previous_node.relationships[NodeRelationship.NEXT] = (
                    split_node.as_related_node_info()
                )
            previous_node = split_node
        all_nodes.extend(split_nodes)

    return all_nodes

句子分割器 #

基类：EventMetadataAwareTextSplitter

解析文本时优先考虑完整句子。

总的来说，这个类会尽量保持句子和段落的完整性。因此，与原始的TokenTextSplitter相比，在节点块末尾出现悬挂句子或句子片段的可能性更小。

参数：

名称	类型	描述	默认
`chunk_size`	`int`	每个分块的令牌大小。	`1024`
`chunk_overlap`	`int`	每个分块拆分时的令牌重叠量。	`200`
`separator`	`str`	默认用于分割成单词的分隔符	`' '`
`paragraph_separator`	`str`	段落之间的分隔符。	`'\n\n\n'`
`secondary_chunking_regex`	`str \| None`	用于分割句子的备用正则表达式。	`'[^,.;。？！]+[,.;。？！]?\|[,.;。？！]'`

workflows/handler.py 中的源代码llama_index/core/node_parser/text/sentence.py

class SentenceSplitter(MetadataAwareTextSplitter):
    """
    Parse text with a preference for complete sentences.

    In general, this class tries to keep sentences and paragraphs together. Therefore
    compared to the original TokenTextSplitter, there are less likely to be
    hanging sentences or parts of sentences at the end of the node chunk.
    """

    chunk_size: int = Field(
        default=DEFAULT_CHUNK_SIZE,
        description="The token chunk size for each chunk.",
        gt=0,
    )
    chunk_overlap: int = Field(
        default=SENTENCE_CHUNK_OVERLAP,
        description="The token overlap of each chunk when splitting.",
        ge=0,
    )
    separator: str = Field(
        default=" ", description="Default separator for splitting into words"
    )
    paragraph_separator: str = Field(
        default=DEFAULT_PARAGRAPH_SEP, description="Separator between paragraphs."
    )
    secondary_chunking_regex: Optional[str] = Field(
        default=CHUNKING_REGEX, description="Backup regex for splitting into sentences."
    )

    _chunking_tokenizer_fn: Callable[[str], List[str]] = PrivateAttr()
    _tokenizer: Callable = PrivateAttr()
    _split_fns: List[Callable] = PrivateAttr()
    _sub_sentence_split_fns: List[Callable] = PrivateAttr()

    def __init__(
        self,
        separator: str = " ",
        chunk_size: int = DEFAULT_CHUNK_SIZE,
        chunk_overlap: int = SENTENCE_CHUNK_OVERLAP,
        tokenizer: Optional[Callable] = None,
        paragraph_separator: str = DEFAULT_PARAGRAPH_SEP,
        chunking_tokenizer_fn: Optional[Callable[[str], List[str]]] = None,
        secondary_chunking_regex: Optional[str] = CHUNKING_REGEX,
        callback_manager: Optional[CallbackManager] = None,
        include_metadata: bool = True,
        include_prev_next_rel: bool = True,
        id_func: Optional[Callable] = None,
    ):
        """Initialize with parameters."""
        if chunk_overlap > chunk_size:
            raise ValueError(
                f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
                f"({chunk_size}), should be smaller."
            )
        id_func = id_func or default_id_func
        callback_manager = callback_manager or CallbackManager([])
        super().__init__(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            secondary_chunking_regex=secondary_chunking_regex,
            separator=separator,
            paragraph_separator=paragraph_separator,
            callback_manager=callback_manager,
            include_metadata=include_metadata,
            include_prev_next_rel=include_prev_next_rel,
            id_func=id_func,
        )
        self._chunking_tokenizer_fn = (
            chunking_tokenizer_fn or split_by_sentence_tokenizer()
        )
        self._tokenizer = tokenizer or get_tokenizer()

        self._split_fns = [
            split_by_sep(paragraph_separator),
            self._chunking_tokenizer_fn,
        ]

        if secondary_chunking_regex:
            self._sub_sentence_split_fns = [
                split_by_regex(secondary_chunking_regex),
                split_by_sep(separator),
                split_by_char(),
            ]
        else:
            self._sub_sentence_split_fns = [
                split_by_sep(separator),
                split_by_char(),
            ]

    @classmethod
    def from_defaults(
        cls,
        separator: str = " ",
        chunk_size: int = DEFAULT_CHUNK_SIZE,
        chunk_overlap: int = SENTENCE_CHUNK_OVERLAP,
        tokenizer: Optional[Callable] = None,
        paragraph_separator: str = DEFAULT_PARAGRAPH_SEP,
        chunking_tokenizer_fn: Optional[Callable[[str], List[str]]] = None,
        secondary_chunking_regex: str = CHUNKING_REGEX,
        callback_manager: Optional[CallbackManager] = None,
        include_metadata: bool = True,
        include_prev_next_rel: bool = True,
    ) -> "SentenceSplitter":
        """Initialize with parameters."""
        callback_manager = callback_manager or CallbackManager([])
        return cls(
            separator=separator,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            tokenizer=tokenizer,
            paragraph_separator=paragraph_separator,
            chunking_tokenizer_fn=chunking_tokenizer_fn,
            secondary_chunking_regex=secondary_chunking_regex,
            callback_manager=callback_manager,
            include_metadata=include_metadata,
            include_prev_next_rel=include_prev_next_rel,
        )

    @classmethod
    def class_name(cls) -> str:
        return "SentenceSplitter"

    def split_text_metadata_aware(self, text: str, metadata_str: str) -> List[str]:
        metadata_len = len(self._tokenizer(metadata_str))
        effective_chunk_size = self.chunk_size - metadata_len
        if effective_chunk_size <= 0:
            raise ValueError(
                f"Metadata length ({metadata_len}) is longer than chunk size "
                f"({self.chunk_size}). Consider increasing the chunk size or "
                "decreasing the size of your metadata to avoid this."
            )
        elif effective_chunk_size < 50:
            print(
                f"Metadata length ({metadata_len}) is close to chunk size "
                f"({self.chunk_size}). Resulting chunks are less than 50 tokens. "
                "Consider increasing the chunk size or decreasing the size of "
                "your metadata to avoid this.",
                flush=True,
            )

        return self._split_text(text, chunk_size=effective_chunk_size)

    def split_text(self, text: str) -> List[str]:
        return self._split_text(text, chunk_size=self.chunk_size)

    def _split_text(self, text: str, chunk_size: int) -> List[str]:
        """
        _Split incoming text and return chunks with overlap size.

        Has a preference for complete sentences, phrases, and minimal overlap.
        """
        if text == "":
            return [text]

        with self.callback_manager.event(
            CBEventType.CHUNKING, payload={EventPayload.CHUNKS: [text]}
        ) as event:
            splits = self._split(text, chunk_size)
            chunks = self._merge(splits, chunk_size)

            event.on_end(payload={EventPayload.CHUNKS: chunks})

        return chunks

    def _split(self, text: str, chunk_size: int) -> List[_Split]:
        r"""
        Break text into splits that are smaller than chunk size.

        The order of splitting is:
        1. split by paragraph separator
        2. split by chunking tokenizer (default is nltk sentence tokenizer)
        3. split by second chunking regex (default is "[^,\.;]+[,\.;]?")
        4. split by default separator (" ")

        """
        token_size = self._token_size(text)
        if token_size <= chunk_size:
            return [_Split(text, is_sentence=True, token_size=token_size)]

        text_splits_by_fns, is_sentence = self._get_splits_by_fns(text)

        text_splits = []
        for text_split_by_fns in text_splits_by_fns:
            token_size = self._token_size(text_split_by_fns)
            if token_size <= chunk_size:
                text_splits.append(
                    _Split(
                        text_split_by_fns,
                        is_sentence=is_sentence,
                        token_size=token_size,
                    )
                )
            else:
                recursive_text_splits = self._split(
                    text_split_by_fns, chunk_size=chunk_size
                )
                text_splits.extend(recursive_text_splits)
        return text_splits

    def _merge(self, splits: List[_Split], chunk_size: int) -> List[str]:
        """Merge splits into chunks."""
        chunks: List[str] = []
        cur_chunk: List[Tuple[str, int]] = []  # list of (text, length)
        last_chunk: List[Tuple[str, int]] = []
        cur_chunk_len = 0
        new_chunk = True

        def close_chunk() -> None:
            nonlocal chunks, cur_chunk, last_chunk, cur_chunk_len, new_chunk

            chunks.append("".join([text for text, length in cur_chunk]))
            last_chunk = cur_chunk
            cur_chunk = []
            cur_chunk_len = 0
            new_chunk = True

            # add overlap to the next chunk using the last one first
            if len(last_chunk) > 0:
                last_index = len(last_chunk) - 1
                while (
                    last_index >= 0
                    and cur_chunk_len + last_chunk[last_index][1] <= self.chunk_overlap
                ):
                    overlap_text, overlap_length = last_chunk[last_index]
                    cur_chunk_len += overlap_length
                    cur_chunk.insert(0, (overlap_text, overlap_length))
                    last_index -= 1

        split_idx = 0
        while split_idx < len(splits):
            cur_split = splits[split_idx]
            if cur_split.token_size > chunk_size:
                raise ValueError("Single token exceeded chunk size")
            if cur_chunk_len + cur_split.token_size > chunk_size and not new_chunk:
                # if adding split to current chunk exceeds chunk size: close out chunk
                close_chunk()
            else:
                # If this is a new chunk with overlap, and adding the split would
                # exceed chunk_size, remove overlap to make room
                if new_chunk and cur_chunk_len + cur_split.token_size > chunk_size:
                    # Remove overlap from the beginning until split fits
                    while (
                        len(cur_chunk) > 0
                        and cur_chunk_len + cur_split.token_size > chunk_size
                    ):
                        _, length = cur_chunk.pop(0)
                        cur_chunk_len -= length

                if (
                    cur_split.is_sentence
                    or cur_chunk_len + cur_split.token_size <= chunk_size
                    or new_chunk  # new chunk, always add at least one split
                ):
                    # add split to chunk
                    cur_chunk_len += cur_split.token_size
                    cur_chunk.append((cur_split.text, cur_split.token_size))
                    split_idx += 1
                    new_chunk = False
                else:
                    # close out chunk
                    close_chunk()

        # handle the last chunk
        if not new_chunk:
            chunk = "".join([text for text, length in cur_chunk])
            chunks.append(chunk)

        # run postprocessing to remove blank spaces
        return self._postprocess_chunks(chunks)

    def _postprocess_chunks(self, chunks: List[str]) -> List[str]:
        """
        Post-process chunks.
        Remove whitespace only chunks and remove leading and trailing whitespace.
        """
        new_chunks = []
        for chunk in chunks:
            stripped_chunk = chunk.strip()
            if stripped_chunk == "":
                continue
            new_chunks.append(stripped_chunk)
        return new_chunks

    def _token_size(self, text: str) -> int:
        return len(self._tokenizer(text))

    def _get_splits_by_fns(self, text: str) -> Tuple[List[str], bool]:
        for split_fn in self._split_fns:
            splits = split_fn(text)
            if len(splits) > 1:
                return splits, True

        for split_fn in self._sub_sentence_split_fns:
            splits = split_fn(text)
            if len(splits) > 1:
                break

        return splits, False

from_defaults `classmethod` #

from_defaults(separator: str = ' ', chunk_size: int = DEFAULT_CHUNK_SIZE, chunk_overlap: int = SENTENCE_CHUNK_OVERLAP, tokenizer: Optional[Callable] = None, paragraph_separator: str = DEFAULT_PARAGRAPH_SEP, chunking_tokenizer_fn: Optional[Callable[[str], List[str]]] = None, secondary_chunking_regex: str = CHUNKING_REGEX, callback_manager: Optional[CallbackManager] = None, include_metadata: bool = True, include_prev_next_rel: bool = True) -> SentenceSplitter

使用参数进行初始化。

workflows/handler.py 中的源代码llama_index/core/node_parser/text/sentence.py

@classmethod
def from_defaults(
    cls,
    separator: str = " ",
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = SENTENCE_CHUNK_OVERLAP,
    tokenizer: Optional[Callable] = None,
    paragraph_separator: str = DEFAULT_PARAGRAPH_SEP,
    chunking_tokenizer_fn: Optional[Callable[[str], List[str]]] = None,
    secondary_chunking_regex: str = CHUNKING_REGEX,
    callback_manager: Optional[CallbackManager] = None,
    include_metadata: bool = True,
    include_prev_next_rel: bool = True,
) -> "SentenceSplitter":
    """Initialize with parameters."""
    callback_manager = callback_manager or CallbackManager([])
    return cls(
        separator=separator,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        tokenizer=tokenizer,
        paragraph_separator=paragraph_separator,
        chunking_tokenizer_fn=chunking_tokenizer_fn,
        secondary_chunking_regex=secondary_chunking_regex,
        callback_manager=callback_manager,
        include_metadata=include_metadata,
        include_prev_next_rel=include_prev_next_rel,
    )

句子窗口节点解析器 #

基类：EventNodeParser

句子窗口节点解析器。

将文档分割为节点，每个节点代表一个句子。每个节点在元数据中包含来自周围句子的窗口。

参数：

名称	类型	描述	默认
`sentence_splitter`	`Optional[Callable]`	将文本分割成句子	`<function split_by_sentence_tokenizer.<locals>.<lambda> at 0x7f82dcb30c20>`
`include_metadata`	`bool`	是否在节点中包含元数据	required
`include_prev_next_rel`	`bool`	是否包含上一页/下一页关系	required
`window_size`	`int`	要捕获的句子两侧的句子数量。	`3`
`window_metadata_key`	`str`	用于存储句子窗口的元数据键。	`'window'`
`original_text_metadata_key`	`str`	用于存储原始句子的元数据键。	`'original_text'`

workflows/handler.py 中的源代码llama_index/core/node_parser/text/sentence_window.py

class SentenceWindowNodeParser(NodeParser):
    """
    Sentence window node parser.

    Splits a document into Nodes, with each node being a sentence.
    Each node contains a window from the surrounding sentences in the metadata.

    Args:
        sentence_splitter (Optional[Callable]): splits text into sentences
        include_metadata (bool): whether to include metadata in nodes
        include_prev_next_rel (bool): whether to include prev/next relationships

    """

    sentence_splitter: Callable[[str], List[str]] = Field(
        default_factory=split_by_sentence_tokenizer,
        description="The text splitter to use when splitting documents.",
        exclude=True,
    )
    window_size: int = Field(
        default=DEFAULT_WINDOW_SIZE,
        description="The number of sentences on each side of a sentence to capture.",
        gt=0,
    )
    window_metadata_key: str = Field(
        default=DEFAULT_WINDOW_METADATA_KEY,
        description="The metadata key to store the sentence window under.",
    )
    original_text_metadata_key: str = Field(
        default=DEFAULT_OG_TEXT_METADATA_KEY,
        description="The metadata key to store the original sentence in.",
    )

    @classmethod
    def class_name(cls) -> str:
        return "SentenceWindowNodeParser"

    @classmethod
    def from_defaults(
        cls,
        sentence_splitter: Optional[Callable[[str], List[str]]] = None,
        window_size: int = DEFAULT_WINDOW_SIZE,
        window_metadata_key: str = DEFAULT_WINDOW_METADATA_KEY,
        original_text_metadata_key: str = DEFAULT_OG_TEXT_METADATA_KEY,
        include_metadata: bool = True,
        include_prev_next_rel: bool = True,
        callback_manager: Optional[CallbackManager] = None,
        id_func: Optional[Callable[[int, Document], str]] = None,
    ) -> "SentenceWindowNodeParser":
        callback_manager = callback_manager or CallbackManager([])

        sentence_splitter = sentence_splitter or split_by_sentence_tokenizer()

        id_func = id_func or default_id_func

        return cls(
            sentence_splitter=sentence_splitter,
            window_size=window_size,
            window_metadata_key=window_metadata_key,
            original_text_metadata_key=original_text_metadata_key,
            include_metadata=include_metadata,
            include_prev_next_rel=include_prev_next_rel,
            callback_manager=callback_manager,
            id_func=id_func,
        )

    def _parse_nodes(
        self,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> List[BaseNode]:
        """Parse document into nodes."""
        all_nodes: List[BaseNode] = []
        nodes_with_progress = get_tqdm_iterable(nodes, show_progress, "Parsing nodes")

        for node in nodes_with_progress:
            nodes = self.build_window_nodes_from_documents([node])
            all_nodes.extend(nodes)

        return all_nodes

    def build_window_nodes_from_documents(
        self, documents: Sequence[Document]
    ) -> List[BaseNode]:
        """Build window nodes from documents."""
        all_nodes: List[BaseNode] = []
        for doc in documents:
            text = doc.text
            text_splits = self.sentence_splitter(text)
            nodes = build_nodes_from_splits(
                text_splits,
                doc,
                id_func=self.id_func,
            )

            # add window to each node
            for i, node in enumerate(nodes):
                window_nodes = nodes[
                    max(0, i - self.window_size) : min(
                        i + self.window_size + 1, len(nodes)
                    )
                ]

                node.metadata[self.window_metadata_key] = " ".join(
                    [n.text for n in window_nodes]
                )
                node.metadata[self.original_text_metadata_key] = node.text

                # exclude window metadata from embed and llm
                node.excluded_embed_metadata_keys.extend(
                    [self.window_metadata_key, self.original_text_metadata_key]
                )
                node.excluded_llm_metadata_keys.extend(
                    [self.window_metadata_key, self.original_text_metadata_key]
                )

            all_nodes.extend(nodes)

        return all_nodes

build_window_nodes_from_documents #

build_window_nodes_from_documents(documents: Sequence[文档]) -> List[BaseNode]

从文档构建窗口节点。

workflows/handler.py 中的源代码llama_index/core/node_parser/text/sentence_window.py

def build_window_nodes_from_documents(
    self, documents: Sequence[Document]
) -> List[BaseNode]:
    """Build window nodes from documents."""
    all_nodes: List[BaseNode] = []
    for doc in documents:
        text = doc.text
        text_splits = self.sentence_splitter(text)
        nodes = build_nodes_from_splits(
            text_splits,
            doc,
            id_func=self.id_func,
        )

        # add window to each node
        for i, node in enumerate(nodes):
            window_nodes = nodes[
                max(0, i - self.window_size) : min(
                    i + self.window_size + 1, len(nodes)
                )
            ]

            node.metadata[self.window_metadata_key] = " ".join(
                [n.text for n in window_nodes]
            )
            node.metadata[self.original_text_metadata_key] = node.text

            # exclude window metadata from embed and llm
            node.excluded_embed_metadata_keys.extend(
                [self.window_metadata_key, self.original_text_metadata_key]
            )
            node.excluded_llm_metadata_keys.extend(
                [self.window_metadata_key, self.original_text_metadata_key]
            )

        all_nodes.extend(nodes)

    return all_nodes

令牌文本分割器 #

基类：EventMetadataAwareTextSplitter

实现基于词语标记的文本分割方法。

参数：

名称	类型	描述	默认
`chunk_size`	`int`	每个分块的令牌大小。	`1024`
`chunk_overlap`	`int`	每个分块拆分时的令牌重叠量。	`20`
`separator`	`str`	默认用于分割成单词的分隔符	`' '`
`backup_separators`	`List`	用于分割的额外分隔符。	`<dynamic>`
`keep_whitespaces`	`bool`	是否在文本块中保留前导/尾随空格。	`False`

workflows/handler.py 中的源代码llama_index/core/node_parser/text/token.py

class TokenTextSplitter(MetadataAwareTextSplitter):
    """Implementation of splitting text that looks at word tokens."""

    chunk_size: int = Field(
        default=DEFAULT_CHUNK_SIZE,
        description="The token chunk size for each chunk.",
        gt=0,
    )
    chunk_overlap: int = Field(
        default=DEFAULT_CHUNK_OVERLAP,
        description="The token overlap of each chunk when splitting.",
        ge=0,
    )
    separator: str = Field(
        default=" ", description="Default separator for splitting into words"
    )
    backup_separators: List = Field(
        default_factory=list, description="Additional separators for splitting."
    )

    keep_whitespaces: bool = Field(
        default=False,
        description="Whether to keep leading/trailing whitespaces in the chunk.",
    )

    _tokenizer: Callable = PrivateAttr()
    _split_fns: List[Callable] = PrivateAttr()

    def __init__(
        self,
        chunk_size: int = DEFAULT_CHUNK_SIZE,
        chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
        tokenizer: Optional[Callable] = None,
        callback_manager: Optional[CallbackManager] = None,
        separator: str = " ",
        backup_separators: Optional[List[str]] = ["\n"],
        keep_whitespaces: bool = False,
        include_metadata: bool = True,
        include_prev_next_rel: bool = True,
        id_func: Optional[Callable[[int, Document], str]] = None,
    ):
        """Initialize with parameters."""
        if chunk_overlap > chunk_size:
            raise ValueError(
                f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
                f"({chunk_size}), should be smaller."
            )
        callback_manager = callback_manager or CallbackManager([])
        id_func = id_func or default_id_func
        super().__init__(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separator=separator,
            backup_separators=backup_separators,
            keep_whitespaces=keep_whitespaces,
            callback_manager=callback_manager,
            include_metadata=include_metadata,
            include_prev_next_rel=include_prev_next_rel,
            id_func=id_func,
        )
        self._tokenizer = tokenizer or get_tokenizer()
        all_seps = [separator] + (backup_separators or [])
        self._split_fns = [split_by_sep(sep) for sep in all_seps] + [split_by_char()]

    @classmethod
    def from_defaults(
        cls,
        chunk_size: int = DEFAULT_CHUNK_SIZE,
        chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
        separator: str = " ",
        backup_separators: Optional[List[str]] = ["\n"],
        callback_manager: Optional[CallbackManager] = None,
        keep_whitespaces: bool = False,
        include_metadata: bool = True,
        include_prev_next_rel: bool = True,
        id_func: Optional[Callable[[int, Document], str]] = None,
    ) -> "TokenTextSplitter":
        """Initialize with default parameters."""
        callback_manager = callback_manager or CallbackManager([])
        return cls(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separator=separator,
            backup_separators=backup_separators,
            keep_whitespaces=keep_whitespaces,
            callback_manager=callback_manager,
            include_metadata=include_metadata,
            include_prev_next_rel=include_prev_next_rel,
            id_func=id_func,
        )

    @classmethod
    def class_name(cls) -> str:
        return "TokenTextSplitter"

    def split_text_metadata_aware(self, text: str, metadata_str: str) -> List[str]:
        """Split text into chunks, reserving space required for metadata str."""
        metadata_len = len(self._tokenizer(metadata_str)) + DEFAULT_METADATA_FORMAT_LEN
        effective_chunk_size = self.chunk_size - metadata_len
        if effective_chunk_size <= 0:
            raise ValueError(
                f"Metadata length ({metadata_len}) is longer than chunk size "
                f"({self.chunk_size}). Consider increasing the chunk size or "
                "decreasing the size of your metadata to avoid this."
            )
        elif effective_chunk_size < 50:
            print(
                f"Metadata length ({metadata_len}) is close to chunk size "
                f"({self.chunk_size}). Resulting chunks are less than 50 tokens. "
                "Consider increasing the chunk size or decreasing the size of "
                "your metadata to avoid this.",
                flush=True,
            )

        return self._split_text(text, chunk_size=effective_chunk_size)

    def split_text(self, text: str) -> List[str]:
        """Split text into chunks."""
        return self._split_text(text, chunk_size=self.chunk_size)

    def _split_text(self, text: str, chunk_size: int) -> List[str]:
        """Split text into chunks up to chunk_size."""
        if text == "":
            return [text]

        with self.callback_manager.event(
            CBEventType.CHUNKING, payload={EventPayload.CHUNKS: [text]}
        ) as event:
            splits = self._split(text, chunk_size)
            chunks = self._merge(splits, chunk_size)

            event.on_end(
                payload={EventPayload.CHUNKS: chunks},
            )

        return chunks

    def _split(self, text: str, chunk_size: int) -> List[str]:
        """
        Break text into splits that are smaller than chunk size.

        The order of splitting is:
        1. split by separator
        2. split by backup separators (if any)
        3. split by characters

        NOTE: the splits contain the separators.
        """
        if len(self._tokenizer(text)) <= chunk_size:
            return [text]

        for split_fn in self._split_fns:
            splits = split_fn(text)
            if len(splits) > 1:
                break

        new_splits = []
        for split in splits:
            split_len = len(self._tokenizer(split))
            if split_len <= chunk_size:
                new_splits.append(split)
            else:
                # recursively split
                new_splits.extend(self._split(split, chunk_size=chunk_size))
        return new_splits

    def _merge(self, splits: List[str], chunk_size: int) -> List[str]:
        """
        Merge splits into chunks.

        The high-level idea is to keep adding splits to a chunk until we
        exceed the chunk size, then we start a new chunk with overlap.

        When we start a new chunk, we pop off the first element of the previous
        chunk until the total length is less than the chunk size.
        """
        chunks: List[str] = []

        cur_chunk: List[str] = []
        cur_len = 0
        for split in splits:
            split_len = len(self._tokenizer(split))
            if split_len > chunk_size:
                _logger.warning(
                    f"Got a split of size {split_len}, ",
                    f"larger than chunk size {chunk_size}.",
                )

            # if we exceed the chunk size after adding the new split, then
            # we need to end the current chunk and start a new one
            if cur_len + split_len > chunk_size:
                # end the previous chunk
                chunk = (
                    "".join(cur_chunk)
                    if self.keep_whitespaces
                    else "".join(cur_chunk).strip()
                )
                if chunk:
                    chunks.append(chunk)

                # start a new chunk with overlap
                # keep popping off the first element of the previous chunk until:
                #   1. the current chunk length is less than chunk overlap
                #   2. the total length is less than chunk size
                while cur_len > self.chunk_overlap or cur_len + split_len > chunk_size:
                    # pop off the first element
                    first_chunk = cur_chunk.pop(0)
                    cur_len -= len(self._tokenizer(first_chunk))

            cur_chunk.append(split)
            cur_len += split_len

        # handle the last chunk
        chunk = (
            "".join(cur_chunk) if self.keep_whitespaces else "".join(cur_chunk).strip()
        )
        if chunk:
            chunks.append(chunk)

        return chunks

from_defaults `classmethod` #

from_defaults(chunk_size: int = DEFAULT_CHUNK_SIZE, chunk_overlap: int = DEFAULT_CHUNK_OVERLAP, separator: str = ' ', backup_separators: Optional[List[str]] = ['\n'], callback_manager: Optional[CallbackManager] = None, keep_whitespaces: bool = False, include_metadata: bool = True, include_prev_next_rel: bool = True, id_func: Optional[Callable[[int, 文档], str]] = None) -> TokenTextSplitter

使用默认参数进行初始化。

workflows/handler.py 中的源代码llama_index/core/node_parser/text/token.py

@classmethod
def from_defaults(
    cls,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
    separator: str = " ",
    backup_separators: Optional[List[str]] = ["\n"],
    callback_manager: Optional[CallbackManager] = None,
    keep_whitespaces: bool = False,
    include_metadata: bool = True,
    include_prev_next_rel: bool = True,
    id_func: Optional[Callable[[int, Document], str]] = None,
) -> "TokenTextSplitter":
    """Initialize with default parameters."""
    callback_manager = callback_manager or CallbackManager([])
    return cls(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separator=separator,
        backup_separators=backup_separators,
        keep_whitespaces=keep_whitespaces,
        callback_manager=callback_manager,
        include_metadata=include_metadata,
        include_prev_next_rel=include_prev_next_rel,
        id_func=id_func,
    )

split_text_metadata_aware #

split_text_metadata_aware(text: str, metadata_str: str) -> List[str]

将文本分割成块，保留元数据字符串所需的空间。

workflows/handler.py 中的源代码llama_index/core/node_parser/text/token.py

def split_text_metadata_aware(self, text: str, metadata_str: str) -> List[str]:
    """Split text into chunks, reserving space required for metadata str."""
    metadata_len = len(self._tokenizer(metadata_str)) + DEFAULT_METADATA_FORMAT_LEN
    effective_chunk_size = self.chunk_size - metadata_len
    if effective_chunk_size <= 0:
        raise ValueError(
            f"Metadata length ({metadata_len}) is longer than chunk size "
            f"({self.chunk_size}). Consider increasing the chunk size or "
            "decreasing the size of your metadata to avoid this."
        )
    elif effective_chunk_size < 50:
        print(
            f"Metadata length ({metadata_len}) is close to chunk size "
            f"({self.chunk_size}). Resulting chunks are less than 50 tokens. "
            "Consider increasing the chunk size or decreasing the size of "
            "your metadata to avoid this.",
            flush=True,
        )

    return self._split_text(text, chunk_size=effective_chunk_size)

split_text #

split_text(text: str) -> List[str]

将文本分割成块。

workflows/handler.py 中的源代码llama_index/core/node_parser/text/token.py

def split_text(self, text: str) -> List[str]:
    """Split text into chunks."""
    return self._split_text(text, chunk_size=self.chunk_size)

get_leaf_nodes #

get_leaf_nodes(nodes: List[BaseNode]) -> List[BaseNode]

获取叶子节点。

workflows/handler.py 中的源代码llama_index/core/node_parser/relational/hierarchical.py

def get_leaf_nodes(nodes: List[BaseNode]) -> List[BaseNode]:
    """Get leaf nodes."""
    leaf_nodes = []
    for node in nodes:
        if NodeRelationship.CHILD not in node.relationships:
            leaf_nodes.append(node)
    return leaf_nodes

get_root_nodes #

get_root_nodes(nodes: List[BaseNode]) -> List[BaseNode]

获取根节点。

workflows/handler.py 中的源代码llama_index/core/node_parser/relational/hierarchical.py

def get_root_nodes(nodes: List[BaseNode]) -> List[BaseNode]:
    """Get root nodes."""
    root_nodes = []
    for node in nodes:
        if NodeRelationship.PARENT not in node.relationships:
            root_nodes.append(node)
    return root_nodes

get_child_nodes #

get_child_nodes(nodes: List[BaseNode], all_nodes: List[BaseNode]) -> List[BaseNode]

从给定的所有节点中获取节点的子节点。

workflows/handler.py 中的源代码llama_index/core/node_parser/relational/hierarchical.py

def get_child_nodes(nodes: List[BaseNode], all_nodes: List[BaseNode]) -> List[BaseNode]:
    """Get child nodes of nodes from given all_nodes."""
    children_ids = []
    for node in nodes:
        if NodeRelationship.CHILD not in node.relationships:
            continue

        children_ids.extend([r.node_id for r in (node.child_nodes or [])])

    child_nodes = []
    for candidate_node in all_nodes:
        if candidate_node.node_id not in children_ids:
            continue
        child_nodes.append(candidate_node)

    return child_nodes

get_deeper_nodes #

get_deeper_nodes(nodes: List[BaseNode], depth: int = 1) -> List[BaseNode]

获取给定节点中根节点的子节点，这些子节点具有指定深度。

workflows/handler.py 中的源代码llama_index/core/node_parser/relational/hierarchical.py

def get_deeper_nodes(nodes: List[BaseNode], depth: int = 1) -> List[BaseNode]:
    """Get children of root nodes in given nodes that have given depth."""
    if depth < 0:
        raise ValueError("Depth cannot be a negative number!")
    root_nodes = get_root_nodes(nodes)
    if not root_nodes:
        raise ValueError("There is no root nodes in given nodes!")

    deeper_nodes = root_nodes
    for _ in range(depth):
        deeper_nodes = get_child_nodes(deeper_nodes, nodes)

    return deeper_nodes

选项：成员：- HTMLNodeParser

Html

HTML节点解析器 #

class_name classmethod #

get_nodes_from_node #

JSON节点解析器 #

class_name classmethod #

get_nodes_from_node #

Markdown节点解析器 #

get_nodes_from_node #

简单文件节点解析器 #

class_name classmethod #

MetadataAwareTextSplitter #

NodeParser #

get_nodes_from_documents #

TextSplitter #

分层节点解析器 #

get_nodes_from_documents #

Markdown元素节点解析器 #

get_nodes_from_node #

aget_nodes_from_node async #

extract_html_tables #

extract_elements #

filter_table #

非结构化元素节点解析器 #

get_nodes_from_node #

aget_nodes_from_node async #

extract_elements #

filter_table #

Llama解析JSON节点解析器 #

get_nodes_from_node #

aget_nodes_from_node async #

extract_elements #

filter_table #

代码分割器 #

from_defaults classmethod #

split_text #

Langchain节点解析器 #

split_text #

语义分割节点解析器 #

build_semantic_nodes_from_documents #

abuild_semantic_nodes_from_documents async #

语义双重合并分割器节点解析器 #

build_semantic_nodes_from_documents #

build_semantic_nodes_from_nodes #

句子分割器 #

from_defaults classmethod #

句子窗口节点解析器 #

build_window_nodes_from_documents #

令牌文本分割器 #

from_defaults classmethod #

split_text_metadata_aware #

split_text #

get_leaf_nodes #

get_root_nodes #

get_child_nodes #

get_deeper_nodes #

class_name `classmethod` #

class_name `classmethod` #

class_name `classmethod` #

aget_nodes_from_node `async` #

aget_nodes_from_node `async` #

aget_nodes_from_node `async` #

from_defaults `classmethod` #

abuild_semantic_nodes_from_documents `async` #

from_defaults `classmethod` #

from_defaults `classmethod` #