Datawhale Introduction to Web Crawling: Notes 2
Regular Expressions
A regular expression (Regular Expression) is a syntax rule for matching strings with a pattern expression.
Regex syntax: metacharacters are arranged and combined to describe the strings to match.
Online regex tester: https://tool.oschina.net/regex/
Metacharacters: special symbols with a fixed meaning.
.*? matches as few characters as possible (lazy / non-greedy), while .* matches as many characters as possible (greedy).
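A minimal sketch of the difference (the sample string and pattern below are made up purely for illustration):

```python
import re

html = '<span class="title">Movie A</span><span class="title">Movie B</span>'

# Greedy: .* grabs as much as it can, so the match runs to the LAST </span>
print(re.findall(r'<span class="title">(.*)</span>', html))
# ['Movie A</span><span class="title">Movie B']

# Lazy: .*? stops at the first possible </span>, giving one clean title per match
print(re.findall(r'<span class="title">(.*?)</span>', html))
# ['Movie A', 'Movie B']
```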
The re module
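Before the example, a quick sketch of the re calls it relies on: re.compile pre-compiles a pattern, re.S (re.DOTALL) lets . also match newlines, and finditer yields match objects whose named groups ((?P<name>...)) are read with .group("name"). The HTML fragment here is made up just to show the calls:

```python
import re

sample = """
<li><span class="name">Alice</span><span class="age">30</span></li>
<li><span class="name">Bob</span><span class="age">25</span></li>
"""

# re.S lets . also match the newlines between tags
pattern = re.compile(
    r'<span class="name">(?P<name>.*?)</span>'
    r'.*?<span class="age">(?P<age>\d+)</span>',
    re.S
)

for m in pattern.finditer(sample):
    # named groups are accessed by the name given in (?P<...>)
    print(m.group("name"), m.group("age"))
# Alice 30
# Bob 25
```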
Example 1: Scraping the Douban Top 250 movie list by hand
```python
import requests
import re
import csv
import time
import random

# Request headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/114.0.0.0 Safari/537.36",
    "Referer": "https://movie.douban.com/top250"
}

# Regular expression: named groups for title, year, rating and vote count.
# "人评价" ("people rated") is the literal text on the Douban page, so it stays in Chinese.
pattern = re.compile(
    r'<li>.*?<span class="title">(?P<name>.*?)</span>.*?'
    r'<br>\s*(?P<year>\d{4}).*?'
    r'<span class="rating_num"[^>]*>(?P<score>\d+\.\d+)</span>.*?'
    r'<span>(?P<num>[\d,]+)人评价</span>',
    re.S
)

# Create the CSV file and write the header row
with open("douban_top250.csv", mode="w", newline="", encoding="utf-8-sig") as f:
    csvwriter = csv.writer(f)
    csvwriter.writerow(["title", "year", "rating", "votes"])

    # Crawl 10 pages (25 movies per page)
    for start in range(0, 250, 25):
        url = f"https://movie.douban.com/top250?start={start}"
        print(f"Fetching page {start//25 + 1}: {url}")
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.encoding = resp.apparent_encoding
            html = resp.text

            # Save the raw page for debugging (optional)
            with open(f"page_debug_{start}.html", "w", encoding="utf-8") as f_debug:
                f_debug.write(html)

            matches = list(pattern.finditer(html))
            print(f"✅ Page {start//25 + 1} OK, {len(matches)} matches")

            for m in matches:
                name = m.group("name").strip()
                year = m.group("year").strip()
                score = m.group("score").strip()
                num = m.group("num").replace(",", "").strip()
                csvwriter.writerow([name, year, score, num])

            # Random delay to avoid triggering anti-scraping measures
            time.sleep(random.uniform(1, 2))
        except Exception as e:
            print(f"❌ Failed to fetch: {e}")

print("🎉 Done, all pages saved to douban_top250.csv")
```
Result:
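To sanity-check the output, the CSV can be read back with the standard library; a quick sketch (it only assumes the file produced by the script above):

```python
import csv

with open("douban_top250.csv", encoding="utf-8-sig") as f:
    rows = list(csv.reader(f))

print(f"{len(rows) - 1} movies")   # header row excluded
for row in rows[1:6]:              # show the first five entries
    print(row)
```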
Example 2: Scraping the Quotes to Scrape site
```python
import re
import requests

def scrape_quotes():
    url = "http://quotes.toscrape.com/"
    response = requests.get(url)
    if response.status_code != 200:
        print("Failed to fetch page")
        return
    html = response.text

    # Match every quote block
    quotes_pattern = re.compile(r'<div class="quote".*?>(.*?)</div>', re.DOTALL)
    quotes = quotes_pattern.findall(html)

    results = []
    for quote in quotes:
        # Extract the quote text
        text_match = re.search(r'<span class="text".*?>(.*?)</span>', quote)
        text = text_match.group(1).strip() if text_match else "N/A"

        # Extract the author
        author_match = re.search(r'<small class="author".*?>(.*?)</small>', quote)
        author = author_match.group(1).strip() if author_match else "N/A"

        # Extract the tags
        tags = re.findall(r'<a class="tag".*?>(.*?)</a>', quote)

        results.append({
            "text": text,
            "author": author,
            "tags": tags
        })

    # Print the results
    for i, result in enumerate(results, 1):
        print(f"Quote {i}:")
        print(f"  Text: {result['text']}")
        print(f"  Author: {result['author']}")
        print(f"  Tags: {', '.join(result['tags'])}\n")

if __name__ == "__main__":
    scrape_quotes()
```
Output:
The bs4 module
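bs4 (BeautifulSoup) parses HTML into a tree, so elements can be located by tag name, class or attribute instead of regex. A minimal sketch on a made-up fragment, using only standard bs4 calls (BeautifulSoup, find, find_all, .get):

```python
from bs4 import BeautifulSoup

html = """
<div class="item">
  <a href="/page/1.htm"><img src="http://example.com/a.jpg" alt="pic A"></a>
  <a href="/page/2.htm"><img src="http://example.com/b.jpg" alt="pic B"></a>
</div>
"""

soup = BeautifulSoup(html, "html.parser")

# find() returns the first matching tag, find_all() returns all of them
div = soup.find("div", class_="item")
for a in div.find_all("a"):
    print(a.get("href"))       # read an attribute with .get()
    print(a.img.get("src"))    # a child tag can be reached as an attribute

print(soup.find("img").get("alt"))  # -> pic A
```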
Scraping the 唯美图片 (wallpaper) section of the Umei gallery (优美图库): https://www.umei.cc/weimeitupian/
```python
import requests
from bs4 import BeautifulSoup
import os
import time
from urllib.parse import urljoin

# Create the image folder if it does not exist
img_dir = "img"
if not os.path.exists(img_dir):
    os.makedirs(img_dir)

headers = {
    "User-Agent": "Mozilla/5.0"
}

# Sub-page links collected beforehand (replace with your own real list)
page_urls = [
    "https://www.umei.cc/weimeitupian/oumeitupian/331451.htm",
    "https://www.umei.cc/weimeitupian/oumeitupian/331450.htm",
    # add more links here
]

for index, page_url in enumerate(page_urls):
    try:
        print(f"Processing sub-page {index + 1}: {page_url}")
        resp = requests.get(page_url, headers=headers, timeout=10)
        resp.encoding = 'utf-8'
        soup = BeautifulSoup(resp.text, "html.parser")

        # The image may appear under .big-pic, .content, .image or #img depending
        # on the page, so simply scan every <img> tag
        img_tags = soup.find_all("img")
        img_count = 0
        for img_tag in img_tags:
            img_url = img_tag.get("src") or img_tag.get("data-original")
            if img_url and img_url.startswith("http") and ".jpg" in img_url:
                img_count += 1
                filename = os.path.join(img_dir, f"page{index + 1}_{img_count}.jpg")

                # Download the image
                img_data = requests.get(img_url, headers=headers).content
                with open(filename, 'wb') as f:
                    f.write(img_data)
                print(f"Saved image: {filename}")

        if img_count == 0:
            print("⚠️ No valid images found")

        time.sleep(1)  # slow down to avoid getting blocked
    except Exception as e:
        print(f"❌ Error while processing {page_url}: {e}")

print("🎉 All sub-pages processed!")
```
Output:
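The page_urls list above is hard-coded. One way to fill it automatically is to parse the listing page itself and keep the links that point to detail pages; the filter below ("/weimeitupian/" in the href, ending in ".htm") is an assumption about the site's URL layout, so treat this as a sketch rather than a verified recipe:

```python
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

list_url = "https://www.umei.cc/weimeitupian/"
resp = requests.get(list_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
resp.encoding = "utf-8"
soup = BeautifulSoup(resp.text, "html.parser")

page_urls = []
for a in soup.find_all("a", href=True):
    href = a["href"]
    # Assumption: detail pages live under /weimeitupian/ and end with .htm
    if "/weimeitupian/" in href and href.endswith(".htm"):
        page_urls.append(urljoin(list_url, href))  # turn relative links into absolute ones

page_urls = list(dict.fromkeys(page_urls))  # drop duplicates while keeping order
print(len(page_urls), "sub-page links collected")
```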