Examples and Tutorials
1. Building and crawling news sources with multithreading
Building and crawling news websites often means handling multiple sources and a large number of articles at the same time. You can significantly improve the performance of this process by using multithreading during the crawl. Even though Python does not offer true thread-level parallelism (because of the GIL), I/O-bound requests can still be processed concurrently.
from newspaper import Source
from newspaper.mthreading import fetch_news
import threading
class NewsCrawler:
    def __init__(self, source_urls, config=None):
        self.sources = [Source(url, config=config) for url in source_urls]
        self.articles = []

    def build_sources(self):
        # Multithreaded source building
        threads = [threading.Thread(target=source.build) for source in self.sources]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()

    def crawl_articles(self):
        # Multithreaded article downloading
        self.articles = fetch_news(self.sources, threads=4)

    def extract_information(self):
        # Extract information from each article
        for source in self.sources:
            print(f"Source {source.url}")
            for article in source.articles[:10]:
                article.parse()
                print(f"Title: {article.title}")
                print(f"Authors: {article.authors}")
                print(f"Text: {article.text[:150]}...")  # Printing first 150 characters of text
                print("-------------------------------")

if __name__ == "__main__":
    source_urls = ['https://slate.com', 'https://time.com']  # Add your news source URLs here
    crawler = NewsCrawler(source_urls)
    crawler.build_sources()
    crawler.crawl_articles()
    crawler.extract_information()
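The NewsCrawler above accepts an optional config object that is passed to every Source. As a minimal sketch, assuming the newspaper3k-style Config attributes memoize_articles and request_timeout are still available in newspaper4k, you could tune the crawl like this:

from newspaper import Config

config = Config()
config.memoize_articles = False  # assumption: 3k-style flag; re-process articles seen in earlier runs
config.request_timeout = 10      # assumption: 3k-style option; seconds to wait per HTTP request

crawler = NewsCrawler(source_urls, config=config)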
2. Fetching articles with Scrapy
Install the required packages
pip install scrapy
pip install newspaper4k
Create a Scrapy project:
scrapy startproject news_scraper
This command creates a new folder named news_scraper containing the necessary Scrapy files.
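The generated layout typically looks like the following (details may differ slightly between Scrapy versions):

news_scraper/
    scrapy.cfg            # deploy/config file
    news_scraper/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py       # project settings
        spiders/
            __init__.py   # your spiders go here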
Write the Scrapy spider
Navigate to the news_scraper/spiders folder and create a new spider, for example news_spider.py:
import scrapy
import newspaper

class NewsSpider(scrapy.Spider):
    name = 'news'
    start_urls = ['https://abcnews.go.com/elections']  # Replace with your target URLs

    def parse(self, response):
        # Extract URLs from the response and yield Scrapy Requests
        for href in response.css('a::attr(href)'):
            yield response.follow(href, self.parse_article)

    def parse_article(self, response):
        # Use Newspaper4k to parse the article
        article = newspaper.article(response.url, language='en', input_html=response.text)
        article.parse()
        article.nlp()
        # Extracted information
        yield {
            'url': response.url,
            'title': article.title,
            'authors': article.authors,
            'text': article.text,
            'publish_date': article.publish_date,
            'keywords': article.keywords,
            'summary': article.summary,
        }
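If the target site is sensitive to rapid crawling, you can slow the spider down in news_scraper/settings.py. This is only a sketch using standard Scrapy settings; the values are illustrative:

# news_scraper/settings.py (excerpt)
ROBOTSTXT_OBEY = True               # respect robots.txt (default in new Scrapy projects)
DOWNLOAD_DELAY = 1.0                # wait about a second between requests to the same site
CONCURRENT_REQUESTS_PER_DOMAIN = 2  # limit parallel requests per domain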
Run the spider
scrapy crawl news -o output.json
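The -o output.json flag makes Scrapy write the yielded items as a JSON array, using the same field names as the dict yielded by parse_article. A short sketch of loading the results back into Python:

import json

with open('output.json', encoding='utf-8') as f:
    items = json.load(f)

for item in items[:5]:
    print(item['publish_date'], item['title'])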
3. Scraping JavaScript-rendered websites with Playwright
Install the required packages
pip install newspaper4k
pip install playwright
playwright install
Scraping with Playwright
from playwright.sync_api import sync_playwright
import newspaper
import time
def scrape_with_playwright(url):
    # Using Playwright to render JavaScript
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        page.goto(url)
        time.sleep(1)  # Allow the JavaScript to render
        content = page.content()
        browser.close()

    # Using Newspaper4k to parse the page content
    article = newspaper.article(url, input_html=content, language='en')
    return article
# Example URL
url = 'https://ec.europa.eu/commission/presscorner/detail/en/ac_24_84' # Replace with the URL of your choice
# Scrape and process the article
article = scrape_with_playwright(url)
article.nlp()
print(f"Title: {article.title}")
print(f"Authors: {article.authors}")
print(f"Publication Date: {article.publish_date}")
print(f"Summary: {article.summary}")
print(f"Keywords: {article.keywords}")
4. Scraping websites that require a login with Playwright
from playwright.sync_api import sync_playwright
import newspaper
def login_and_fetch_article(url, login_url, username, password):
    # Using Playwright to handle login and fetch the article
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)  # Set headless=False to watch the browser actions
        page = browser.new_page()

        # Automating login
        page.goto(login_url)
        page.fill('input[name="log"]', username)  # Adjust the selector as per the site's HTML
        page.fill('input[name="pwd"]', password)  # Adjust the selector as per the site's HTML
        page.click('input[type="submit"][value="Login"]')  # Adjust the selector as per the site's HTML

        # Wait for navigation after login
        page.wait_for_url('/')

        # Navigating to the article
        page.goto(url)
        content = page.content()
        browser.close()

    # Using Newspaper4k to parse the page content
    article = newspaper.article(url, input_html=content, language='en')
    return article
# Example URLs and credentials
login_url = 'https://www.undercurrentnews.com/login/' # Replace with the actual login URL
article_url = 'https://www.undercurrentnews.com/2024/01/08/editors-choice-farmed-shrimp-output-to-drop-in-2024-fallout-from-us-expanded-russia-ban/' # Replace with the URL of the article you want to scrape
username = 'tester_news' # Replace with your username
password = 'test' # Replace with your password
# Fetch and process the article
article = login_and_fetch_article(article_url, login_url, username, password)
article.nlp()
print(f"Title: {article.title}")
print(f"Authors: {article.authors}")
print(f"Publication Date: {article.publish_date}")
print(f"Summary: {article.summary}")
print(f"Keywords: {article.keywords}")