当前位置：首页 > news >正文

大连本地知识库的搭建--数据收集与预处理_01

news 2025/9/15 20:37:11

1.马蜂窝爬虫

编程语言：Python
爬虫框架：Selenium（用于浏览器自动化）
解析库：BeautifulSoup（用于解析HTML）

2.爬虫策略

目标网站：马蜂窝（https://www.mafengwo.cn/）
目标数据：大连的攻略和游记
流程概述：
1. 打开马蜂窝首页并进行搜索。
2. 提取搜索结果页面中的攻略和游记链接。
3. 分别访问每个攻略和游记页面，提取内容并保存到本地文件。

3.导入的库

import os
import random
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

4.配置内容保存文件的路径和ChromeDriver的路径（需要先下载与本机Chrome浏览器版本对应的chromedriver.exe文件）

# Path of the text file that the crawled page text is appended to.
file_path = 'D:\\Pycharm\\space\\Mafengwo\\mafengwo_01.txt'

# Path of the chromedriver executable; it must be a separate statement on its
# own line, otherwise it is swallowed by the comment above and never defined.
chrome_driver_path = "D:\\chromedriver\\chromedriver.exe"

5.为了应对爬取过程中可能出现的中断，使用get_links(file_path)函数从文件中读取已爬取的链接，并用save_link(file_path, link)函数把新爬取的链接追加到该文件中，这样重新运行时可以跳过已爬取的页面，从上次中断的位置继续爬取。

def get_links(file_path):
    """Return the set of links already recorded in ``file_path``.

    Args:
        file_path: Path of a UTF-8 text file holding one link per line.

    Returns:
        A set with every non-blank line of the file, stripped of surrounding
        whitespace. An empty set is returned when the file does not exist.
    """
    # Open directly and handle the missing-file case, instead of checking for
    # existence first: one fewer filesystem call and no check-then-open race.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            # Blank lines would otherwise show up as a bogus '' link.
            return {line for line in map(str.strip, f) if line}
    except FileNotFoundError:
        return set()


def save_link(file_path, link):
    """Append ``link`` as one line to the UTF-8 text file ``file_path``.

    Args:
        file_path: Path of the file to append to (created if missing).
        link: The link text to record.
    """
    with open(file_path, 'a', encoding='utf-8') as f:
        f.write(link + '\n')


# File that records which links have already been crawled.
already_crawled_file = 'already_crawled_01.txt'

6.初始化Selenium WebDriver

# Build the Chrome options first: pass the AutomationControlled blink-feature
# flag and exclude the "enable-automation" switch (presumably so the site is
# less likely to detect the automated browser).
options = webdriver.ChromeOptions()
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option('excludeSwitches', ['enable-automation'])

# Start Chrome through the chromedriver executable configured above.
s = Service(chrome_driver_path)
driver = webdriver.Chrome(service=s, options=options)

7.打开马蜂窝网站并搜索“大连”。crawl_category()函数获取攻略和游记的链接，crawl_content()函数爬取指定链接的内容。为了避免网页内容爬取不完整以及触发反爬虫机制，crawl_content()采用滚动页面并随机等待的方式处理，滚动次数和等待页面加载的时间可以根据实际情况进行调整。最后爬取得到攻略和游记的内容。

"""4. 打开马蜂窝网站并搜索“大连”"""driver.get("https://www.mafengwo.cn/")driver.maximize_window()wait = WebDriverWait(driver, 10)wait.until(EC.visibility_of_element_located((By.ID, '_j_index_search_input_all')))start_input = driver.find_element(By.ID, '_j_index_search_input_all')start_input.send_keys('大连')search_button = driver.find_element(By.ID, '_j_index_search_btn_all')search_button.click()time.sleep(random.randint(0, 3))"""5. 获取攻略和游记的链接"""def crawl_category(category_link):"""爬取指定分类下的攻略或游记链接"""driver.get(category_link)wait = WebDriverWait(driver, 10)wait.until(EC.visibility_of_element_located((By.ID, '_j_search_result_left')))outer_div = driver.find_element(By.ID, '_j_search_result_left')a_elements = outer_div.find_elements(By.XPATH, './/div[@class="flt1"]/a')hrefs = [a_element.get_attribute('href') for a_element in a_elements]already_crawled = get_links(already_crawled_file)for href in hrefs:if href not in already_crawled:save_link(already_crawled_file, href)crawl_content(href)def crawl_content(link):"""爬取指定链接的内容"""driver.get(link)wait = WebDriverWait(driver, 10)# 滚动页面加载更多内容for _ in range(7):  # 滚动次数，根据实际情况调整driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")time.sleep(random.randint(1, 3))  # 等待页面加载if "gl" in link:wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'sideL')))soup = BeautifulSoup(driver.page_source, 'html.parser')content_box = soup.find('div', class_='sideL')else:wait.until(EC.visibility_of_element_located((By.CLASS_NAME, '_j_content_box')))soup = BeautifulSoup(driver.page_source, 'html.parser')content_box = soup.find(class_="_j_content_box")all_text = ' '.join(content_box.stripped_strings)with open(file_path, 'a', encoding='utf-8') as file:file.write(all_text + '\n')time.sleep(random.randint(0, 3))# 获取攻略和游记的链接element = driver.find_element(By.XPATH, '//*[@id="_j_mfw_search_main"]/div[1]/div/div/a[3]')notes = element.get_attribute('href')element = driver.find_element(By.XPATH, 
'//*[@id="_j_mfw_search_main"]/div[1]/div/div/a[4]')guides = element.get_attribute('href')# 爬取攻略和游记内容crawl_category(guides)crawl_category(notes)driver.quit()

8.结果展示：