ScrapeGraph Tool Spec #

Bases: BaseToolSpec

ScrapeGraph tool specification for web scraping operations.

This tool provides access to ScrapeGraph AI's web scraping capabilities, including smart scraping, content conversion to markdown format, search functionality, and basic HTML scraping with various options.

Source code in llama_index/tools/scrapegraph/base.py
from typing import Any, Dict, List, Optional, Union

from pydantic import BaseModel
from scrapegraph_py import Client

from llama_index.core.tools.tool_spec.base import BaseToolSpec


class ScrapegraphToolSpec(BaseToolSpec):
    """
    ScrapeGraph tool specification for web scraping operations.

    This tool provides access to ScrapeGraph AI's web scraping capabilities,
    including smart scraping, content conversion to markdown, search functionality,
    and basic HTML scraping with various options.
    """

    spec_functions = [
        "scrapegraph_smartscraper",
        "scrapegraph_markdownify",
        "scrapegraph_search",
        "scrapegraph_scrape",
        "scrapegraph_agentic_scraper",
    ]

    def __init__(self, api_key: Optional[str] = None) -> None:
        """
        Initialize the ScrapeGraph tool specification.

        Args:
            api_key (Optional[str]): ScrapeGraph API key. If not provided,
                                   will attempt to load from environment variable SGAI_API_KEY.

        """
        if api_key:
            self.client = Client(api_key=api_key)
        else:
            self.client = Client.from_env()

    def scrapegraph_smartscraper(
        self,
        prompt: str,
        url: str,
        schema: Optional[Union[List[BaseModel], Dict[str, Any]]] = None,
        **kwargs,
    ) -> Union[List[Dict], Dict]:
        """
        Perform intelligent web scraping using ScrapeGraph's SmartScraper.

        Args:
            prompt (str): User prompt describing what data to extract from the webpage
            url (str): Target website URL to scrape
            schema (Optional[Union[List[BaseModel], Dict]]): Pydantic models or dict defining output structure
            **kwargs: Additional parameters for the SmartScraper

        Returns:
            Union[List[Dict], Dict]: Scraped data matching the provided schema or prompt requirements

        """
        try:
            return self.client.smartscraper(
                website_url=url, user_prompt=prompt, output_schema=schema, **kwargs
            )
        except Exception as e:
            return {"error": f"SmartScraper failed: {e!s}"}

    def scrapegraph_markdownify(self, url: str, **kwargs) -> str:
        """
        Convert webpage content to markdown format using ScrapeGraph.

        Args:
            url (str): Target website URL to convert to markdown
            **kwargs: Additional parameters for the markdownify operation

        Returns:
            str: Markdown representation of the webpage content

        """
        try:
            return self.client.markdownify(website_url=url, **kwargs)
        except Exception as e:
            return f"Markdownify failed: {e!s}"

    def scrapegraph_search(
        self, query: str, max_results: Optional[int] = None, **kwargs
    ) -> str:
        """
        Perform a search query using ScrapeGraph's search functionality.

        Args:
            query (str): Search query to execute
            max_results (Optional[int]): Maximum number of search results to return
            **kwargs: Additional parameters for the search operation

        Returns:
            str: Search results from ScrapeGraph

        """
        try:
            search_params = {"query": query}
            if max_results:
                search_params["max_results"] = max_results
            search_params.update(kwargs)

            return self.client.search(**search_params)
        except Exception as e:
            return f"Search failed: {e!s}"

    def scrapegraph_scrape(
        self,
        url: str,
        render_heavy_js: bool = False,
        headers: Optional[Dict[str, str]] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """
        Perform basic HTML scraping using ScrapeGraph's scrape functionality.

        Args:
            url (str): Target website URL to scrape
            render_heavy_js (bool): Whether to enable JavaScript rendering for dynamic content
            headers (Optional[Dict[str, str]]): Custom HTTP headers to include in the request
            **kwargs: Additional parameters for the scrape operation

        Returns:
            Dict[str, Any]: Dictionary containing scraped HTML content and metadata

        """
        try:
            scrape_params = {"website_url": url, "render_heavy_js": render_heavy_js}
            if headers:
                scrape_params["headers"] = headers
            scrape_params.update(kwargs)

            return self.client.scrape(**scrape_params)
        except Exception as e:
            return {"error": f"Scrape failed: {e!s}"}

    def scrapegraph_agentic_scraper(
        self,
        prompt: str,
        url: str,
        schema: Optional[Union[List[BaseModel], Dict[str, Any]]] = None,
        **kwargs,
    ) -> Union[List[Dict], Dict]:
        """
        Perform agentic web scraping that can navigate and interact with websites.

        Args:
            prompt (str): User prompt describing the scraping task and navigation requirements
            url (str): Target website URL to start scraping from
            schema (Optional[Union[List[BaseModel], Dict]]): Pydantic models or dict defining output structure
            **kwargs: Additional parameters for the agentic scraper

        Returns:
            Union[List[Dict], Dict]: Scraped data from the agentic navigation and extraction

        """
        try:
            return self.client.agentic_scraper(
                website_url=url, user_prompt=prompt, output_schema=schema, **kwargs
            )
        except Exception as e:
            return {"error": f"Agentic scraper failed: {e!s}"}

scrapegraph_smartscraper #

scrapegraph_smartscraper(prompt: str, url: str, schema: Optional[Union[List[BaseModel], Dict[str, Any]]] = None, **kwargs) -> Union[List[Dict], Dict]

Perform intelligent web scraping using ScrapeGraph's SmartScraper.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| prompt | str | User prompt describing what data to extract from the webpage | required |
| url | str | Target website URL to scrape | required |
| schema | Optional[Union[List[BaseModel], Dict]] | Pydantic models or dict defining the output structure | None |
| **kwargs | | Additional parameters for the SmartScraper | {} |

Returns:

| Type | Description |
| --- | --- |
| Union[List[Dict], Dict] | Scraped data matching the provided schema or prompt requirements |

Source code in llama_index/tools/scrapegraph/base.py
def scrapegraph_smartscraper(
    self,
    prompt: str,
    url: str,
    schema: Optional[Union[List[BaseModel], Dict[str, Any]]] = None,
    **kwargs,
) -> Union[List[Dict], Dict]:
    """
    Perform intelligent web scraping using ScrapeGraph's SmartScraper.

    Args:
        prompt (str): User prompt describing what data to extract from the webpage
        url (str): Target website URL to scrape
        schema (Optional[Union[List[BaseModel], Dict]]): Pydantic models or dict defining output structure
        **kwargs: Additional parameters for the SmartScraper

    Returns:
        Union[List[Dict], Dict]: Scraped data matching the provided schema or prompt requirements

    """
    try:
        return self.client.smartscraper(
            website_url=url, user_prompt=prompt, output_schema=schema, **kwargs
        )
    except Exception as e:
        return {"error": f"SmartScraper failed: {e!s}"}

scrapegraph_markdownify #

scrapegraph_markdownify(url: str, **kwargs) -> str

Convert webpage content to markdown format using ScrapeGraph.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| url | str | Target website URL to convert to markdown | required |
| **kwargs | | Additional parameters for the markdownify operation | {} |

Returns:

| Type | Description |
| --- | --- |
| str | Markdown representation of the webpage content |

Source code in llama_index/tools/scrapegraph/base.py
def scrapegraph_markdownify(self, url: str, **kwargs) -> str:
    """
    Convert webpage content to markdown format using ScrapeGraph.

    Args:
        url (str): Target website URL to convert to markdown
        **kwargs: Additional parameters for the markdownify operation

    Returns:
        str: Markdown representation of the webpage content

    """
    try:
        return self.client.markdownify(website_url=url, **kwargs)
    except Exception as e:
        return f"Markdownify failed: {e!s}"

scrapegraph_search #

scrapegraph_search(query: str, max_results: Optional[int] = None, **kwargs) -> str

Perform a search query using ScrapeGraph's search functionality.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| query | str | Search query to execute | required |
| max_results | Optional[int] | Maximum number of search results to return | None |
| **kwargs | | Additional parameters for the search operation | {} |

Returns:

| Type | Description |
| --- | --- |
| str | Search results from ScrapeGraph |

Source code in llama_index/tools/scrapegraph/base.py
def scrapegraph_search(
    self, query: str, max_results: Optional[int] = None, **kwargs
) -> str:
    """
    Perform a search query using ScrapeGraph's search functionality.

    Args:
        query (str): Search query to execute
        max_results (Optional[int]): Maximum number of search results to return
        **kwargs: Additional parameters for the search operation

    Returns:
        str: Search results from ScrapeGraph

    """
    try:
        search_params = {"query": query}
        if max_results:
            search_params["max_results"] = max_results
        search_params.update(kwargs)

        return self.client.search(**search_params)
    except Exception as e:
        return f"Search failed: {e!s}"

scrapegraph_scrape #

scrapegraph_scrape(url: str, render_heavy_js: bool = False, headers: Optional[Dict[str, str]] = None, **kwargs) -> Dict[str, Any]

Perform basic HTML scraping using ScrapeGraph's scrape functionality.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| url | str | Target website URL to scrape | required |
| render_heavy_js | bool | Whether to enable JavaScript rendering for dynamic content | False |
| headers | Optional[Dict[str, str]] | Custom HTTP headers to include in the request | None |
| **kwargs | | Additional parameters for the scrape operation | {} |

Returns:

| Type | Description |
| --- | --- |
| Dict[str, Any] | Dictionary containing scraped HTML content and metadata |

Source code in llama_index/tools/scrapegraph/base.py
def scrapegraph_scrape(
    self,
    url: str,
    render_heavy_js: bool = False,
    headers: Optional[Dict[str, str]] = None,
    **kwargs,
) -> Dict[str, Any]:
    """
    Perform basic HTML scraping using ScrapeGraph's scrape functionality.

    Args:
        url (str): Target website URL to scrape
        render_heavy_js (bool): Whether to enable JavaScript rendering for dynamic content
        headers (Optional[Dict[str, str]]): Custom HTTP headers to include in the request
        **kwargs: Additional parameters for the scrape operation

    Returns:
        Dict[str, Any]: Dictionary containing scraped HTML content and metadata

    """
    try:
        scrape_params = {"website_url": url, "render_heavy_js": render_heavy_js}
        if headers:
            scrape_params["headers"] = headers
        scrape_params.update(kwargs)

        return self.client.scrape(**scrape_params)
    except Exception as e:
        return {"error": f"Scrape failed: {e!s}"}

scrapegraph_agentic_scraper #

scrapegraph_agentic_scraper(prompt: str, url: str, schema: Optional[Union[List[BaseModel], Dict[str, Any]]] = None, **kwargs) -> Union[List[Dict], Dict]

Perform agentic web scraping that can navigate and interact with websites.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| prompt | str | User prompt describing the scraping task and navigation requirements | required |
| url | str | Target website URL to start scraping from | required |
| schema | Optional[Union[List[BaseModel], Dict]] | Pydantic models or dict defining the output structure | None |
| **kwargs | | Additional parameters for the agentic scraper | {} |

Returns:

| Type | Description |
| --- | --- |
| Union[List[Dict], Dict] | Scraped data from the agentic navigation and extraction |

Source code in llama_index/tools/scrapegraph/base.py
def scrapegraph_agentic_scraper(
    self,
    prompt: str,
    url: str,
    schema: Optional[Union[List[BaseModel], Dict[str, Any]]] = None,
    **kwargs,
) -> Union[List[Dict], Dict]:
    """
    Perform agentic web scraping that can navigate and interact with websites.

    Args:
        prompt (str): User prompt describing the scraping task and navigation requirements
        url (str): Target website URL to start scraping from
        schema (Optional[Union[List[BaseModel], Dict]]): Pydantic models or dict defining output structure
        **kwargs: Additional parameters for the agentic scraper

    Returns:
        Union[List[Dict], Dict]: Scraped data from the agentic navigation and extraction

    """
    try:
        return self.client.agentic_scraper(
            website_url=url, user_prompt=prompt, output_schema=schema, **kwargs
        )
    except Exception as e:
        return {"error": f"Agentic scraper failed: {e!s}"}
