ScrapeGraph Tool Spec #

Bases: BaseToolSpec

ScrapeGraph tool specification for web scraping operations.

This tool provides access to ScrapeGraph AI's web scraping capabilities, including smart scraping, content conversion to markdown format, search functionality, and basic HTML scraping with various options.

Source code in llama_index/tools/scrapegraph/base.py
from typing import Any, Dict, List, Optional, Union

from pydantic import BaseModel
from scrapegraph_py import Client

from llama_index.core.tools.tool_spec.base import BaseToolSpec


class ScrapegraphToolSpec(BaseToolSpec):
    """
    ScrapeGraph tool specification for web scraping operations.

    This tool provides access to ScrapeGraph AI's web scraping capabilities,
    including smart scraping, content conversion to markdown, search functionality,
    and basic HTML scraping with various options.
    """

    spec_functions = [
        "scrapegraph_smartscraper",
        "scrapegraph_markdownify",
        "scrapegraph_search",
        "scrapegraph_scrape",
        "scrapegraph_agentic_scraper",
    ]

    def __init__(self, api_key: Optional[str] = None) -> None:
        """
        Initialize the ScrapeGraph tool specification.

        Args:
            api_key (Optional[str]): ScrapeGraph API key. If not provided,
                                   will attempt to load from environment variable SGAI_API_KEY.

        """
        if api_key:
            self.client = Client(api_key=api_key)
        else:
            self.client = Client.from_env()

    def scrapegraph_smartscraper(
        self,
        prompt: str,
        url: str,
        schema: Optional[Union[List[BaseModel], Dict[str, Any]]] = None,
        **kwargs,
    ) -> Union[List[Dict], Dict]:
        """
        Perform intelligent web scraping using ScrapeGraph's SmartScraper.

        Args:
            prompt (str): User prompt describing what data to extract from the webpage
            url (str): Target website URL to scrape
            schema (Optional[Union[List[BaseModel], Dict]]): Pydantic models or dict defining output structure
            **kwargs: Additional parameters for the SmartScraper

        Returns:
            Union[List[Dict], Dict]: Scraped data matching the provided schema or prompt requirements

        """
        try:
            return self.client.smartscraper(
                website_url=url, user_prompt=prompt, output_schema=schema, **kwargs
            )
        except Exception as e:
            return {"error": f"SmartScraper failed: {e!s}"}

    def scrapegraph_markdownify(self, url: str, **kwargs) -> str:
        """
        Convert webpage content to markdown format using ScrapeGraph.

        Args:
            url (str): Target website URL to convert to markdown
            **kwargs: Additional parameters for the markdownify operation

        Returns:
            str: Markdown representation of the webpage content

        """
        try:
            return self.client.markdownify(website_url=url, **kwargs)
        except Exception as e:
            return f"Markdownify failed: {e!s}"

    def scrapegraph_search(
        self, query: str, max_results: Optional[int] = None, **kwargs
    ) -> str:
        """
        Perform a search query using ScrapeGraph's search functionality.

        Args:
            query (str): Search query to execute
            max_results (Optional[int]): Maximum number of search results to return
            **kwargs: Additional parameters for the search operation

        Returns:
            str: Search results from ScrapeGraph

        """
        try:
            search_params = {"query": query}
            if max_results:
                search_params["max_results"] = max_results
            search_params.update(kwargs)

            return self.client.search(**search_params)
        except Exception as e:
            return f"Search failed: {e!s}"

    def scrapegraph_scrape(
        self,
        url: str,
        render_heavy_js: bool = False,
        headers: Optional[Dict[str, str]] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """
        Perform basic HTML scraping using ScrapeGraph's scrape functionality.

        Args:
            url (str): Target website URL to scrape
            render_heavy_js (bool): Whether to enable JavaScript rendering for dynamic content
            headers (Optional[Dict[str, str]]): Custom HTTP headers to include in the request
            **kwargs: Additional parameters for the scrape operation

        Returns:
            Dict[str, Any]: Dictionary containing scraped HTML content and metadata

        """
        try:
            scrape_params = {"website_url": url, "render_heavy_js": render_heavy_js}
            if headers:
                scrape_params["headers"] = headers
            scrape_params.update(kwargs)

            return self.client.scrape(**scrape_params)
        except Exception as e:
            return {"error": f"Scrape failed: {e!s}"}

    def scrapegraph_agentic_scraper(
        self,
        prompt: str,
        url: str,
        schema: Optional[Union[List[BaseModel], Dict[str, Any]]] = None,
        **kwargs,
    ) -> Union[List[Dict], Dict]:
        """
        Perform agentic web scraping that can navigate and interact with websites.

        Args:
            prompt (str): User prompt describing the scraping task and navigation requirements
            url (str): Target website URL to start scraping from
            schema (Optional[Union[List[BaseModel], Dict]]): Pydantic models or dict defining output structure
            **kwargs: Additional parameters for the agentic scraper

        Returns:
            Union[List[Dict], Dict]: Scraped data from the agentic navigation and extraction

        """
        try:
            return self.client.agentic_scraper(
                website_url=url, user_prompt=prompt, output_schema=schema, **kwargs
            )
        except Exception as e:
            return {"error": f"Agentic scraper failed: {e!s}"}

scrapegraph_smartscraper #

scrapegraph_smartscraper(prompt: str, url: str, schema: Optional[Union[List[BaseModel], Dict[str, Any]]] = None, **kwargs) -> Union[List[Dict], Dict]

Perform intelligent web scraping using ScrapeGraph's SmartScraper.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| prompt | str | User prompt describing what data to extract from the webpage | required |
| url | str | Target website URL to scrape | required |
| schema | Optional[Union[List[BaseModel], Dict]] | Pydantic models or dict defining the output structure | None |
| **kwargs | | Additional parameters for the SmartScraper | {} |

Returns:

| Type | Description |
| --- | --- |
| Union[List[Dict], Dict] | Scraped data matching the provided schema or prompt requirements |

Source code in llama_index/tools/scrapegraph/base.py
def scrapegraph_smartscraper(
    self,
    prompt: str,
    url: str,
    schema: Optional[Union[List[BaseModel], Dict[str, Any]]] = None,
    **kwargs,
) -> Union[List[Dict], Dict]:
    """
    Perform intelligent web scraping using ScrapeGraph's SmartScraper.

    Args:
        prompt (str): User prompt describing what data to extract from the webpage
        url (str): Target website URL to scrape
        schema (Optional[Union[List[BaseModel], Dict]]): Pydantic models or dict defining output structure
        **kwargs: Additional parameters for the SmartScraper

    Returns:
        Union[List[Dict], Dict]: Scraped data matching the provided schema or prompt requirements

    """
    try:
        return self.client.smartscraper(
            website_url=url, user_prompt=prompt, output_schema=schema, **kwargs
        )
    except Exception as e:
        return {"error": f"SmartScraper failed: {e!s}"}

scrapegraph_markdownify #

scrapegraph_markdownify(url: str, **kwargs) -> str

Convert webpage content to markdown format using ScrapeGraph.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| url | str | Target website URL to convert to markdown | required |
| **kwargs | | Additional parameters for the markdownify operation | {} |

Returns:

| Type | Description |
| --- | --- |
| str | Markdown representation of the webpage content |

Source code in llama_index/tools/scrapegraph/base.py
def scrapegraph_markdownify(self, url: str, **kwargs) -> str:
    """
    Convert webpage content to markdown format using ScrapeGraph.

    Args:
        url (str): Target website URL to convert to markdown
        **kwargs: Additional parameters for the markdownify operation

    Returns:
        str: Markdown representation of the webpage content

    """
    try:
        return self.client.markdownify(website_url=url, **kwargs)
    except Exception as e:
        return f"Markdownify failed: {e!s}"

scrapegraph_search #

scrapegraph_search(query: str, max_results: Optional[int] = None, **kwargs) -> str

Perform a search query using ScrapeGraph's search functionality.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| query | str | Search query to execute | required |
| max_results | Optional[int] | Maximum number of search results to return | None |
| **kwargs | | Additional parameters for the search operation | {} |

Returns:

| Type | Description |
| --- | --- |
| str | Search results from ScrapeGraph |

Source code in llama_index/tools/scrapegraph/base.py
def scrapegraph_search(
    self, query: str, max_results: Optional[int] = None, **kwargs
) -> str:
    """
    Perform a search query using ScrapeGraph's search functionality.

    Args:
        query (str): Search query to execute
        max_results (Optional[int]): Maximum number of search results to return
        **kwargs: Additional parameters for the search operation

    Returns:
        str: Search results from ScrapeGraph

    """
    try:
        search_params = {"query": query}
        if max_results:
            search_params["max_results"] = max_results
        search_params.update(kwargs)

        return self.client.search(**search_params)
    except Exception as e:
        return f"Search failed: {e!s}"

scrapegraph_scrape #

scrapegraph_scrape(url: str, render_heavy_js: bool = False, headers: Optional[Dict[str, str]] = None, **kwargs) -> Dict[str, Any]

Perform basic HTML scraping using ScrapeGraph's scrape functionality.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| url | str | Target website URL to scrape | required |
| render_heavy_js | bool | Whether to enable JavaScript rendering for dynamic content | False |
| headers | Optional[Dict[str, str]] | Custom HTTP headers to include in the request | None |
| **kwargs | | Additional parameters for the scrape operation | {} |

Returns:

| Type | Description |
| --- | --- |
| Dict[str, Any] | Dictionary containing scraped HTML content and metadata |

Source code in llama_index/tools/scrapegraph/base.py
def scrapegraph_scrape(
    self,
    url: str,
    render_heavy_js: bool = False,
    headers: Optional[Dict[str, str]] = None,
    **kwargs,
) -> Dict[str, Any]:
    """
    Perform basic HTML scraping using ScrapeGraph's scrape functionality.

    Args:
        url (str): Target website URL to scrape
        render_heavy_js (bool): Whether to enable JavaScript rendering for dynamic content
        headers (Optional[Dict[str, str]]): Custom HTTP headers to include in the request
        **kwargs: Additional parameters for the scrape operation

    Returns:
        Dict[str, Any]: Dictionary containing scraped HTML content and metadata

    """
    try:
        scrape_params = {"website_url": url, "render_heavy_js": render_heavy_js}
        if headers:
            scrape_params["headers"] = headers
        scrape_params.update(kwargs)

        return self.client.scrape(**scrape_params)
    except Exception as e:
        return {"error": f"Scrape failed: {e!s}"}

scrapegraph_agentic_scraper #

scrapegraph_agentic_scraper(prompt: str, url: str, schema: Optional[Union[List[BaseModel], Dict[str, Any]]] = None, **kwargs) -> Union[List[Dict], Dict]

Perform agentic web scraping that can navigate and interact with websites.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| prompt | str | User prompt describing the scraping task and navigation requirements | required |
| url | str | Target website URL to start scraping from | required |
| schema | Optional[Union[List[BaseModel], Dict]] | Pydantic models or dict defining the output structure | None |
| **kwargs | | Additional parameters for the agentic scraper | {} |

Returns:

| Type | Description |
| --- | --- |
| Union[List[Dict], Dict] | Scraped data from the agentic navigation and extraction |

Source code in llama_index/tools/scrapegraph/base.py
def scrapegraph_agentic_scraper(
    self,
    prompt: str,
    url: str,
    schema: Optional[Union[List[BaseModel], Dict[str, Any]]] = None,
    **kwargs,
) -> Union[List[Dict], Dict]:
    """
    Perform agentic web scraping that can navigate and interact with websites.

    Args:
        prompt (str): User prompt describing the scraping task and navigation requirements
        url (str): Target website URL to start scraping from
        schema (Optional[Union[List[BaseModel], Dict]]): Pydantic models or dict defining output structure
        **kwargs: Additional parameters for the agentic scraper

    Returns:
        Union[List[Dict], Dict]: Scraped data from the agentic navigation and extraction

    """
    try:
        return self.client.agentic_scraper(
            website_url=url, user_prompt=prompt, output_schema=schema, **kwargs
        )
    except Exception as e:
        return {"error": f"Agentic scraper failed: {e!s}"}
