目标页面: https://www.shanghairanking.cn/rankings/bcur/2023
打印在终端
import requests
import json
from urllib.parse import urljoin
from parsel import Selectorurl = 'https://www.shanghairanking.cn/rankings/bcur/2023'headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
}data = []def parse():resp = requests.get(url=url, headers=headers)resp.encoding = 'utf8' if resp.status_code == 200:html_data = resp.textselector = Selector(text=html_data)scool_list = selector.xpath('//*[@id="content-box"]/div[2]/table/tbody/tr').getall()for itemn in scool_list:scool = Selector(text=itemn) data.append({'排名': scool.xpath('.//td[1]//text()').get().strip(),'校名': scool.css('.name-cn::text').get().strip(),'详情': urljoin(url, scool.css('a::attr(href)').get().strip()),'城市': scool.xpath('.//td[3]/text()').get().strip(),'评分': scool.xpath('.//td[5]/text()').get().strip()}) print(json.dumps(data, indent=4, ensure_ascii=False))else:print('状态码不为200: ', resp.text)if __name__ == '__main__':parse()

目标页面: https://movie.douban.com/top250
打印终端
import requests
import json
from urllib.parse import urljoin
from parsel import Selectorurl = 'https://movie.douban.com/top250?'headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
}data = []def parse():resp = requests.get(url, headers=headers)if resp.status_code == 200:html_data = resp.textselector = Selector(text=html_data)move_html_list = selector.css('#content > div > div.article > ol > li').getall()for item in move_html_list:move = Selector(text=item)data.append({'排名': move.xpath('.//em/text()').get().strip(),'电影名称': move.css('div > div.info > div.hd > a > span:nth-child(1)::text').get().strip(),'评分': move.css('div > div.info > div.bd > div > span.rating_num::text').get().strip(),'详情链接': move.css('div > div.pic > a::attr(href)').get().strip(),'图片链接': move.css('div > div.pic > a > img::attr(src)').get().strip()})print(json.dumps(data, indent=4, ensure_ascii=False))else:print('状态码不为200', resp.text)if __name__ == '__main__':parse()
