Skip to content

使用DSPy从文档自动生成代码

本教程演示了如何使用DSPy自动从URL获取文档并为任何库生成可运行的代码示例。该系统能够分析文档网站,提取关键概念,并生成定制化的代码示例。

你将构建什么

一个基于文档驱动的代码生成系统,能够:

  • 从多个URL获取并解析文档
  • 提取API模式、方法及使用示例
  • 为特定用例生成可运行代码
  • 提供解释和最佳实践
  • 可与任何库的文档配合使用

设置

pip install dspy requests beautifulsoup4 html2text

步骤1:文档获取与处理

import dspy
import requests
from bs4 import BeautifulSoup
import html2text
from typing import List, Dict, Any
import json
from urllib.parse import urljoin, urlparse
import time

# Configure DSPy
lm = dspy.LM(model='openai/gpt-4o-mini')
dspy.configure(lm=lm)

class DocumentationFetcher:
    """Fetches and processes documentation from URLs."""

    def __init__(self, max_retries=3, delay=1):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        self.max_retries = max_retries
        self.delay = delay
        self.html_converter = html2text.HTML2Text()
        self.html_converter.ignore_links = False
        self.html_converter.ignore_images = True

    def fetch_url(self, url: str) -> dict[str, str]:
        """Fetch content from a single URL."""
        for attempt in range(self.max_retries):
            try:
                print(f"📡 Fetching: {url} (attempt {attempt + 1})")
                response = self.session.get(url, timeout=10)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')

                # Remove script and style elements
                for script in soup(["script", "style", "nav", "footer", "header"]):
                    script.decompose()

                # Convert to markdown for better LLM processing
                markdown_content = self.html_converter.handle(str(soup))

                return {
                    "url": url,
                    "title": soup.title.string if soup.title else "No title",
                    "content": markdown_content,
                    "success": True
                }

            except Exception as e:
                print(f"❌ Error fetching {url}: {e}")
                if attempt < self.max_retries - 1:
                    time.sleep(self.delay)
                else:
                    return {
                        "url": url,
                        "title": "Failed to fetch",
                        "content": f"Error: {str(e)}",
                        "success": False
                    }

        return {"url": url, "title": "Failed", "content": "", "success": False}

    def fetch_documentation(self, urls: list[str]) -> list[dict[str, str]]:
        """Fetch documentation from multiple URLs."""
        results = []

        for url in urls:
            result = self.fetch_url(url)
            results.append(result)
            time.sleep(self.delay)  # Be respectful to servers

        return results

class LibraryAnalyzer(dspy.Signature):
    """Analyze library documentation to understand core concepts and patterns."""
    library_name: str = dspy.InputField(desc="Name of the library to analyze")
    documentation_content: str = dspy.InputField(desc="Combined documentation content")

    core_concepts: list[str] = dspy.OutputField(desc="Main concepts and components")
    common_patterns: list[str] = dspy.OutputField(desc="Common usage patterns")
    key_methods: list[str] = dspy.OutputField(desc="Important methods and functions")
    installation_info: str = dspy.OutputField(desc="Installation and setup information")
    code_examples: list[str] = dspy.OutputField(desc="Example code snippets found")

class CodeGenerator(dspy.Signature):
    """Generate code examples for specific use cases using the target library."""
    library_info: str = dspy.InputField(desc="Library concepts and patterns")
    use_case: str = dspy.InputField(desc="Specific use case to implement")
    requirements: str = dspy.InputField(desc="Additional requirements or constraints")

    code_example: str = dspy.OutputField(desc="Complete, working code example")
    explanation: str = dspy.OutputField(desc="Step-by-step explanation of the code")
    best_practices: list[str] = dspy.OutputField(desc="Best practices and tips")
    imports_needed: list[str] = dspy.OutputField(desc="Required imports and dependencies")

class DocumentationLearningAgent(dspy.Module):
    """Agent that learns from documentation URLs and generates code examples."""

    def __init__(self):
        super().__init__()
        self.fetcher = DocumentationFetcher()
        self.analyze_docs = dspy.ChainOfThought(LibraryAnalyzer)
        self.generate_code = dspy.ChainOfThought(CodeGenerator)
        self.refine_code = dspy.ChainOfThought(
            "code, feedback -> improved_code: str, changes_made: list[str]"
        )

    def learn_from_urls(self, library_name: str, doc_urls: list[str]) -> Dict:
        """Learn about a library from its documentation URLs."""

        print(f"📚 Learning about {library_name} from {len(doc_urls)} URLs...")

        # Fetch all documentation
        docs = self.fetcher.fetch_documentation(doc_urls)

        # Combine successful fetches
        combined_content = "\n\n---\n\n".join([
            f"URL: {doc['url']}\nTitle: {doc['title']}\n\n{doc['content']}"
            for doc in docs if doc['success']
        ])

        if not combined_content:
            raise ValueError("No documentation could be fetched successfully")

        # Analyze combined documentation
        analysis = self.analyze_docs(
            library_name=library_name,
            documentation_content=combined_content
        )

        return {
            "library": library_name,
            "source_urls": [doc['url'] for doc in docs if doc['success']],
            "core_concepts": analysis.core_concepts,
            "patterns": analysis.common_patterns,
            "methods": analysis.key_methods,
            "installation": analysis.installation_info,
            "examples": analysis.code_examples,
            "fetched_docs": docs
        }

    def generate_example(self, library_info: Dict, use_case: str, requirements: str = "") -> Dict:
        """Generate a code example for a specific use case."""

        # Format library information for the generator
        info_text = f"""
        Library: {library_info['library']}
        Core Concepts: {', '.join(library_info['core_concepts'])}
        Common Patterns: {', '.join(library_info['patterns'])}
        Key Methods: {', '.join(library_info['methods'])}
        Installation: {library_info['installation']}
        Example Code Snippets: {'; '.join(library_info['examples'][:3])}  # First 3 examples
        """

        code_result = self.generate_code(
            library_info=info_text,
            use_case=use_case,
            requirements=requirements
        )

        return {
            "code": code_result.code_example,
            "explanation": code_result.explanation,
            "best_practices": code_result.best_practices,
            "imports": code_result.imports_needed
        }

# Initialize the learning agent
agent = DocumentationLearningAgent()

步骤2:从文档URL中学习

def learn_library_from_urls(library_name: str, documentation_urls: list[str]) -> Dict:
    """Learn about any library from its documentation URLs."""

    try:
        library_info = agent.learn_from_urls(library_name, documentation_urls)

        print(f"\n🔍 Library Analysis Results for {library_name}:")
        print(f"Sources: {len(library_info['source_urls'])} successful fetches")
        print(f"Core Concepts: {library_info['core_concepts']}")
        print(f"Common Patterns: {library_info['patterns']}")
        print(f"Key Methods: {library_info['methods']}")
        print(f"Installation: {library_info['installation']}")
        print(f"Found {len(library_info['examples'])} code examples")

        return library_info

    except Exception as e:
        print(f"❌ Error learning library: {e}")
        raise

# Example 1: Learn FastAPI from official documentation
fastapi_urls = [
    "https://fastapi.tiangolo.com/",
    "https://fastapi.tiangolo.com/tutorial/first-steps/",
    "https://fastapi.tiangolo.com/tutorial/path-params/",
    "https://fastapi.tiangolo.com/tutorial/query-params/"
]

print("🚀 Learning FastAPI from official documentation...")
fastapi_info = learn_library_from_urls("FastAPI", fastapi_urls)

# Example 2: Learn a different library (you can replace with any library)
streamlit_urls = [
    "https://docs.streamlit.io/",
    "https://docs.streamlit.io/get-started",
    "https://docs.streamlit.io/develop/api-reference"
]

print("\n\n📊 Learning Streamlit from official documentation...")
streamlit_info = learn_library_from_urls("Streamlit", streamlit_urls)

步骤3:生成代码示例

def generate_examples_for_library(library_info: Dict, library_name: str):
    """Generate code examples for any library based on its documentation."""

    # Define generic use cases that can apply to most libraries
    use_cases = [
        {
            "name": "Basic Setup and Hello World",
            "description": f"Create a minimal working example with {library_name}",
            "requirements": "Include installation, imports, and basic usage"
        },
        {
            "name": "Common Operations",
            "description": f"Demonstrate the most common {library_name} operations",
            "requirements": "Show typical workflow and best practices"
        },
        {
            "name": "Advanced Usage",
            "description": f"Create a more complex example showcasing {library_name} capabilities",
            "requirements": "Include error handling and optimization"
        }
    ]

    generated_examples = []

    print(f"\n🔧 Generating examples for {library_name}...")

    for use_case in use_cases:
        print(f"\n📝 {use_case['name']}")
        print(f"Description: {use_case['description']}")

        example = agent.generate_example(
            library_info=library_info,
            use_case=use_case['description'],
            requirements=use_case['requirements']
        )

        print("\n💻 Generated Code:")
        print("```python")
        print(example['code'])
        print("```")

        print("\n📦 Required Imports:")
        for imp in example['imports']:
            print(f"  • {imp}")

        print("\n📝 Explanation:")
        print(example['explanation'])

        print("\n✅ Best Practices:")
        for practice in example['best_practices']:
            print(f"  • {practice}")

        generated_examples.append({
            "use_case": use_case['name'],
            "code": example['code'],
            "imports": example['imports'],
            "explanation": example['explanation'],
            "best_practices": example['best_practices']
        })

        print("-" * 80)

    return generated_examples

# Generate examples for both libraries
print("🎯 Generating FastAPI Examples:")
fastapi_examples = generate_examples_for_library(fastapi_info, "FastAPI")

print("\n\n🎯 Generating Streamlit Examples:")
streamlit_examples = generate_examples_for_library(streamlit_info, "Streamlit")

步骤4:交互式库学习功能

def learn_any_library(library_name: str, documentation_urls: list[str], use_cases: list[str] = None):
    """Learn any library from its documentation and generate examples."""

    if use_cases is None:
        use_cases = [
            "Basic setup and hello world example",
            "Common operations and workflows",
            "Advanced usage with best practices"
        ]

    print(f"🚀 Starting automated learning for {library_name}...")
    print(f"Documentation sources: {len(documentation_urls)} URLs")

    try:
        # Step 1: Learn from documentation
        library_info = agent.learn_from_urls(library_name, documentation_urls)

        # Step 2: Generate examples for each use case
        all_examples = []

        for i, use_case in enumerate(use_cases, 1):
            print(f"\n📝 Generating example {i}/{len(use_cases)}: {use_case}")

            example = agent.generate_example(
                library_info=library_info,
                use_case=use_case,
                requirements="Include error handling, comments, and follow best practices"
            )

            all_examples.append({
                "use_case": use_case,
                "code": example['code'],
                "imports": example['imports'],
                "explanation": example['explanation'],
                "best_practices": example['best_practices']
            })

        return {
            "library_info": library_info,
            "examples": all_examples
        }

    except Exception as e:
        print(f"❌ Error learning {library_name}: {e}")
        return None

def interactive_learning_session():
    """Interactive session for learning libraries with user input."""

    print("🎯 Welcome to the Interactive Library Learning System!")
    print("This system will help you learn any Python library from its documentation.\n")

    learned_libraries = {}

    while True:
        print("\n" + "="*60)
        print("🚀 LIBRARY LEARNING SESSION")
        print("="*60)

        # Get library name from user
        library_name = input("\n📚 Enter the library name you want to learn (or 'quit' to exit): ").strip()

        if library_name.lower() in ['quit', 'exit', 'q']:
            print("\n👋 Thanks for using the Interactive Library Learning System!")
            break

        if not library_name:
            print("❌ Please enter a valid library name.")
            continue

        # Get documentation URLs
        print(f"\n🔗 Enter documentation URLs for {library_name} (one per line, empty line to finish):")
        urls = []
        while True:
            url = input("  URL: ").strip()
            if not url:
                break
            if not url.startswith(('http://', 'https://')):
                print("    ⚠️  Please enter a valid URL starting with http:// or https://")
                continue
            urls.append(url)

        if not urls:
            print("❌ No valid URLs provided. Skipping this library.")
            continue

        # Get custom use cases from user
        print(f"\n🎯 Define use cases for {library_name} (optional, press Enter for defaults):")
        print("   Default use cases will be: Basic setup, Common operations, Advanced usage")

        user_wants_custom = input("   Do you want to define custom use cases? (y/n): ").strip().lower()

        use_cases = None
        if user_wants_custom in ['y', 'yes']:
            print("   Enter your use cases (one per line, empty line to finish):")
            use_cases = []
            while True:
                use_case = input("     Use case: ").strip()
                if not use_case:
                    break
                use_cases.append(use_case)

            if not use_cases:
                print("   No custom use cases provided, using defaults.")
                use_cases = None

        # Learn the library
        print(f"\n🚀 Starting learning process for {library_name}...")
        result = learn_any_library(library_name, urls, use_cases)

        if result:
            learned_libraries[library_name] = result
            print(f"\n✅ Successfully learned {library_name}!")

            # Show summary
            print(f"\n📊 Learning Summary for {library_name}:")
            print(f"   • Core concepts: {len(result['library_info']['core_concepts'])} identified")
            print(f"   • Common patterns: {len(result['library_info']['patterns'])} found")
            print(f"   • Examples generated: {len(result['examples'])}")

            # Ask if user wants to see examples
            show_examples = input(f"\n👀 Do you want to see the generated examples for {library_name}? (y/n): ").strip().lower()

            if show_examples in ['y', 'yes']:
                for i, example in enumerate(result['examples'], 1):
                    print(f"\n{'─'*50}")
                    print(f"📝 Example {i}: {example['use_case']}")
                    print(f"{'─'*50}")

                    print("\n💻 Generated Code:")
                    print("```python")
                    print(example['code'])
                    print("```")

                    print(f"\n📦 Required Imports:")
                    for imp in example['imports']:
                        print(f"  • {imp}")

                    print(f"\n📝 Explanation:")
                    print(example['explanation'])

                    print(f"\n✅ Best Practices:")
                    for practice in example['best_practices']:
                        print(f"  • {practice}")

                    # Ask if user wants to see the next example
                    if i < len(result['examples']):
                        continue_viewing = input(f"\nContinue to next example? (y/n): ").strip().lower()
                        if continue_viewing not in ['y', 'yes']:
                            break

            # Offer to save results
            save_results = input(f"\n💾 Save learning results for {library_name} to file? (y/n): ").strip().lower()

            if save_results in ['y', 'yes']:
                filename = input(f"   Enter filename (default: {library_name.lower()}_learning.json): ").strip()
                if not filename:
                    filename = f"{library_name.lower()}_learning.json"

                try:
                    import json
                    with open(filename, 'w') as f:
                        json.dump(result, f, indent=2, default=str)
                    print(f"   ✅ Results saved to {filename}")
                except Exception as e:
                    print(f"   ❌ Error saving file: {e}")

        else:
            print(f"❌ Failed to learn {library_name}")

        # Ask if user wants to learn another library
        print(f"\n📚 Libraries learned so far: {list(learned_libraries.keys())}")
        continue_learning = input("\n🔄 Do you want to learn another library? (y/n): ").strip().lower()

        if continue_learning not in ['y', 'yes']:
            break

    # Final summary
    if learned_libraries:
        print(f"\n🎉 Session Summary:")
        print(f"Successfully learned {len(learned_libraries)} libraries:")
        for lib_name, info in learned_libraries.items():
            print(f"  • {lib_name}: {len(info['examples'])} examples generated")

    return learned_libraries

# Example: Run interactive learning session
if __name__ == "__main__":
    # Run interactive session
    learned_libraries = interactive_learning_session()

示例输出

当你运行交互式学习系统时,你会看到:

交互会话开始:

🎯 欢迎使用交互式库学习系统!
该系统将帮助您从文档中学习任何Python库。

============================================================
🚀 库学习会话
============================================================

📚 输入您想学习的库名称(或输入'quit'退出):FastAPI

🔗 输入FastAPI的文档URL(每行一个,空行结束):
  URL: https://fastapi.tiangolo.com/
  URL: https://fastapi.tiangolo.com/tutorial/first-steps/
  URL: https://fastapi.tiangolo.com/tutorial/path-params/
  URL: 

🎯 定义FastAPI的用例(可选,按Enter使用默认值):
   默认用例为:基础设置、常用操作、高级用法
   是否要定义自定义用例?(y/n):y
   输入您的用例(每行一个,空行结束):
     用例:创建带身份验证的REST API
     用例:构建文件上传端点
     用例:添加与SQLAlchemy的数据库集成
     用例:

文档处理:

🚀 开始FastAPI的学习进程...
🚀 开始FastAPI的自动化学习...
文档来源:3个URL
📡 获取中:https://fastapi.tiangolo.com/ (尝试1)
📡 获取中:https://fastapi.tiangolo.com/tutorial/first-steps/ (尝试1)
📡 获取中:https://fastapi.tiangolo.com/tutorial/path-params/ (尝试1)
📚 从3个URL学习FastAPI...

🔍 FastAPI库分析结果:
来源:3次成功获取
核心概念:['FastAPI应用', '路径操作', '依赖项', '请求/响应模型']
常见模式:['app = FastAPI()', '基于装饰器的路由', 'Pydantic模型']
关键方法:['FastAPI()', '@app.get()', '@app.post()', 'uvicorn.run()']
安装:pip install fastapi uvicorn

Code Generation:

📝 Generating example 1/3: Create a REST API with authentication

✅ Successfully learned FastAPI!

📊 Learning Summary for FastAPI:
   • Core concepts: 4 identified
   • Common patterns: 3 found
   • Examples generated: 3

👀 Do you want to see the generated examples for FastAPI? (y/n): y

──────────────────────────────────────────────────
📝 Example 1: Create a REST API with authentication
──────────────────────────────────────────────────

💻 Generated Code:
from fastapi import FastAPI, Depends, HTTPException, status
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
import uvicorn
from typing import Dict
import jwt
from datetime import datetime, timedelta

app = FastAPI(title="Authenticated API", version="1.0.0")
security = HTTPBearer()

# Secret key for JWT (use environment variable in production)
SECRET_KEY = "your-secret-key-here"
ALGORITHM = "HS256"

def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
    try:
        payload = jwt.decode(credentials.credentials, SECRET_KEY, algorithms=[ALGORITHM])
        username: str = payload.get("sub")
        if username is None:
            raise HTTPException(status_code=401, detail="Invalid token")
        return username
    except jwt.PyJWTError:
        raise HTTPException(status_code=401, detail="Invalid token")

@app.post("/login")
async def login(username: str, password: str) -> dict[str, str]:
    # In production, verify against database
    if username == "admin" and password == "secret":
        token_data = {"sub": username, "exp": datetime.utcnow() + timedelta(hours=24)}
        token = jwt.encode(token_data, SECRET_KEY, algorithm=ALGORITHM)
        return {"access_token": token, "token_type": "bearer"}
    raise HTTPException(status_code=401, detail="Invalid credentials")

@app.get("/protected")
async def protected_route(current_user: str = Depends(verify_token)) -> dict[str, str]:
    return {"message": f"Hello {current_user}! This is a protected route."}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)

📦 Required Imports:
  • pip install fastapi uvicorn python-jose[cryptography]
  • from fastapi import FastAPI, Depends, HTTPException, status
  • from fastapi.security import HTTPBearer
  • import jwt

📝 Explanation:
This example creates a FastAPI application with JWT-based authentication. It includes a login endpoint that returns a JWT token and a protected route that requires authentication...

✅ Best Practices:
  • Use environment variables for secret keys
  • Implement proper password hashing in production
  • Add token expiration and refresh logic
  • Include proper error handling

Continue to next example? (y/n): n

💾 Save learning results for FastAPI to file? (y/n): y
   Enter filename (default: fastapi_learning.json): 
   ✅ Results saved to fastapi_learning.json

📚 Libraries learned so far: ['FastAPI']

🔄 Do you want to learn another library? (y/n): n

🎉 Session Summary:
Successfully learned 1 libraries:
  • FastAPI: 3 examples generated

后续步骤

  • GitHub 集成: 从 README 文件和示例仓库中学习
  • 视频教程处理: 从视频文档中提取信息
  • 社区示例: 汇总来自Stack Overflow和论坛的示例
  • 版本对比: 跟踪库版本间的API变更
  • 测试生成: 自动为生成的代码创建单元测试
  • 页面抓取: 自动抓取文档页面以主动了解使用方式

本教程展示了DSPy如何自动化从文档中学习不熟悉库的整个过程,使其对快速技术采用和探索非常有价值。

优云智算