当前位置: 首页 > news >正文

爬虫练习_01


前言

基础爬虫小练习01

一、requests 模块使用

demo_01

import requests
from lxml import etree

# Demo 1: fetch the first page of the Douban Top 250 list and print each
# movie's title and rating.
url = "https://movie.douban.com/top250"

# Headers copied from a browser session, with two corrections:
#   * "authority" / "method" / "path" / "scheme" are HTTP/2 pseudo-headers that
#     DevTools displays; sent through requests they would just be meaningless
#     extra headers, so they are omitted.
#   * accept-encoding is limited to gzip/deflate, which requests always
#     decodes. Advertising br/zstd without the optional decoder packages
#     installed can leave resp.text as undecoded bytes.
headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-encoding": "gzip, deflate",
    "accept-language": "zh-CN,zh;q=0.9",
    # NOTE(review): cookie captured from one browser session; presumably it
    # expires and has to be refreshed - confirm before relying on it.
    "cookie": "bid=cqDgOHLreU0; _pk_id.100001.4cf6=a48c68336a0dbbc8.1723366816.; _pk_ses.100001.4cf6=1; __utma=30149280.1388041158.1723366816.1723366816.1723366816.1; __utmb=30149280.0.10.1723366816; __utmc=30149280; __utmz=30149280.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=223695111.165463845.1723366816.1723366816.1723366816.1; __utmb=223695111.0.10.1723366816; __utmc=223695111; __utmz=223695111.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); ap_v=0,6.0; __yadk_uid=RjXMEHOKLnUzT6aAzCxf17A5kcLgeeL9",
    "priority": "u=0, i",
    "sec-ch-ua": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
}

# A timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status() turns a 4xx/5xx reply into an explicit error instead of
# silently parsing an error page.
resp = requests.get(url=url, headers=headers, timeout=10)
resp.raise_for_status()

page = etree.HTML(resp.text)
rts = page.xpath("//ol[@class='grid_view']/li/div[@class='item']")
for rt in rts:
    titles = rt.xpath(".//span[@class='title']/text()")
    scores = rt.xpath(".//span[@class='rating_num']/text()")
    # Skip entries missing either field rather than crashing on [0].
    if not titles or not scores:
        continue
    print(titles[0], scores[0])

demo_02

# https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action=
import requests
import lxml  # NOTE: not used in this demo; kept from the original snippet

# Demo 2: call the JSON endpoint behind the ranking page above and print the
# decoded response.
url = "https://movie.douban.com/j/chart/top_list"

# Headers copied from a browser session, with three corrections:
#   * the HTTP/2 pseudo-headers (authority/method/path/scheme) are omitted;
#     they are not regular request headers.
#   * the "Enhanced" / "The" / "New" / "Updated" entries are omitted; they
#     appear to be DevTools UI text picked up during copy/paste, not headers.
#   * accept-encoding is limited to gzip/deflate, which requests always
#     decodes (br/zstd need optional decoder packages).
headers = {
    "accept": "*/*",
    "accept-encoding": "gzip, deflate",
    "accept-language": "zh-CN,zh;q=0.9",
    # NOTE(review): cookie captured from one browser session; presumably it
    # expires and has to be refreshed - confirm before relying on it.
    "cookie": "bid=cqDgOHLreU0; _pk_id.100001.4cf6=a48c68336a0dbbc8.1723366816.; _pk_ses.100001.4cf6=1; __utma=30149280.1388041158.1723366816.1723366816.1723366816.1; __utmb=30149280.0.10.1723366816; __utmc=30149280; __utmz=30149280.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=223695111.165463845.1723366816.1723366816.1723366816.1; __utmb=223695111.0.10.1723366816; __utmc=223695111; __utmz=223695111.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); ap_v=0,6.0; __yadk_uid=RjXMEHOKLnUzT6aAzCxf17A5kcLgeeL9; ll=\"108296\"; _vwo_uuid_v2=DA8FF1B0948EE9728736018FE9DFF12E8|1eb76b564769076007eeb2dd472eae01",
    "priority": "u=1, i",
    "referer": "https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action=",
    "sec-ch-ua": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
    "x-requested-with": "XMLHttpRequest",
}

# Query-string parameters; requests URL-encodes them (e.g. the ":" in
# interval_id) when building the final URL.
my_params = {
    "type": "13",
    "interval_id": "100:90",
    "action": "",
    "start": "0",
    "limit": "20",
}

# Timeout avoids hanging on a stalled connection; raise_for_status() surfaces
# 4xx/5xx replies before .json() is attempted on an error page.
page = requests.get(url=url, params=my_params, headers=headers, timeout=10)
page.raise_for_status()

# Debug helpers: raw body and the final URL including the query string.
# print(page.text)
# print(page.request.url)
print(page.json())

demo_03

# https://www.iciba.com/translate
import requests

# Demo 3: POST a form to the iciba translation endpoint and print the JSON
# reply.
url = "https://ifanyi.iciba.com/index.php"

# Headers copied from a browser session, with these corrections:
#   * the HTTP/2 pseudo-headers (authority/method/path/scheme) are omitted;
#     they are not regular request headers.
#   * "content-length" is omitted; requests computes it from the encoded form
#     body, so a hard-coded value copied from the browser is redundant at best.
#   * the "Enhanced" / "The" / "New" / "Updated" entries are omitted; they
#     appear to be DevTools UI text picked up during copy/paste, not headers.
#   * accept-encoding is limited to gzip/deflate, which requests always
#     decodes (br/zstd need optional decoder packages).
my_headers = {
    "accept": "application/json, text/plain, */*",
    "accept-encoding": "gzip, deflate",
    "accept-language": "zh-CN,zh;q=0.9",
    "content-type": "application/x-www-form-urlencoded",
    "origin": "https://www.iciba.com",
    "priority": "u=1, i",
    "referer": "https://www.iciba.com/",
    "sec-ch-ua": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-site",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
}

# Query-string parameters.
# NOTE(review): "sign" was captured from a browser request; presumably it is
# tied to the submitted text - confirm before changing form_data["q"].
my_param = {
    "c": "trans",
    "m": "fy",
    "client": "6",
    "auth_user": "key_web_new_fanyi",
    "sign": "SioZA5jWOFlUevETjhAhp9RriqVIAJSQ+xmfU0q7dIE=",
}

# Form body, sent as application/x-www-form-urlencoded via data=.
form_data = {
    "from": "auto",
    "to": "auto",
    "q": "i love you",
}

# Timeout avoids hanging on a stalled connection; raise_for_status() surfaces
# 4xx/5xx replies before .json() is attempted on an error page.
resp = requests.post(
    url,
    params=my_param,
    data=form_data,
    headers=my_headers,
    timeout=10,
)
resp.raise_for_status()
print(resp.json())

demo_04

# Download a single image and save it to tu.jpg.
import requests

url = "https://img.yituyu.com/gallery/8234/00_llcUYWmo.jpg"

# Headers copied from a browser session, with two corrections:
#   * "if-modified-since" / "if-none-match" are omitted. They are cache
#     validators from the browser's cached copy; sending them lets the server
#     answer 304 Not Modified with an empty body, which would be written out
#     as an empty tu.jpg.
#   * accept-encoding is limited to gzip/deflate, which requests always
#     decodes (br/zstd need optional decoder packages).
headers = {
    "accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
    "accept-encoding": "gzip, deflate",
    "accept-language": "zh-CN,zh;q=0.9",
    "connection": "keep-alive",
    "cookie": "yituyu_first_time=1723381635000; yituyu_os=Windows%20NT%2010.0%3B%20Win64%3B%20x64; Hm_lvt_9714eb07ec1e2c497aefe3d4dfded3ed=1723381639; HMACCOUNT=211F43F3C6950EDF; Hm_lpvt_9714eb07ec1e2c497aefe3d4dfded3ed=1723381807",
    "host": "img.yituyu.com",
    "referer": "https://www.yituyu.com/",
    "sec-ch-ua": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "image",
    "sec-fetch-mode": "no-cors",
    "sec-fetch-site": "same-site",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
}

# Timeout avoids hanging on a stalled connection; raise_for_status() makes
# sure an error page is never saved as if it were the image.
resp = requests.get(url, headers=headers, timeout=10)
resp.raise_for_status()

# Report the size instead of dumping the raw image bytes to the console.
print(f"downloaded {len(resp.content)} bytes")

# Binary mode: resp.content is bytes and must be written unmodified.
with open("tu.jpg", mode="wb") as f:
    f.write(resp.content)

demo_05

# https://movie.douban.com/review/best/
import re

import requests
from lxml import etree

# Demo 5: list the reviews on the "best reviews" page, then fetch each
# review's full text from the per-review JSON endpoint and print it with all
# whitespace removed.
url = "https://movie.douban.com/review/best/"

# Headers copied from a browser session, with two corrections:
#   * the HTTP/2 pseudo-headers (authority/method/path/scheme) are omitted;
#     they are not regular request headers.
#   * accept-encoding is limited to gzip/deflate, which requests always
#     decodes (br/zstd need optional decoder packages).
my_headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-encoding": "gzip, deflate",
    "accept-language": "zh-CN,zh;q=0.9",
    "cache-control": "max-age=0",
    # NOTE(review): cookie captured from one browser session; presumably it
    # expires and has to be refreshed - confirm before relying on it.
    "cookie": "bid=cqDgOHLreU0; _pk_id.100001.4cf6=a48c68336a0dbbc8.1723366816.; __utmc=30149280; __utmz=30149280.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmc=223695111; __utmz=223695111.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __yadk_uid=RjXMEHOKLnUzT6aAzCxf17A5kcLgeeL9; ll=\"108296\"; _vwo_uuid_v2=DA8FF1B0948EE9728736018FE9DFF12E8|1eb76b564769076007eeb2dd472eae01; _pk_ses.100001.4cf6=1; ap_v=0,6.0; __utma=30149280.1388041158.1723366816.1723366816.1723382928.2; __utmb=30149280.0.10.1723382928; __utma=223695111.165463845.1723366816.1723366816.1723382928.2; __utmb=223695111.0.10.1723382928",
    "priority": "u=0, i",
    "referer": "https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action=",
    "sec-ch-ua": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
}

# Compiled once; used inside the loop to strip every whitespace character.
whitespace = re.compile(r"\s")

# Timeout avoids hanging on a stalled connection; raise_for_status() surfaces
# 4xx/5xx replies instead of parsing an error page.
resp_1 = requests.get(url, headers=my_headers, timeout=10)
resp_1.raise_for_status()
# print(resp_1.text)

page = etree.HTML(resp_1.text)
# Each review card is a <div> carrying a data-cid attribute.
rts = page.xpath("//div[@data-cid]")
print(len(rts))

for rt in rts:
    titles = rt.xpath(".//div[@class='main-bd']/h2/a/text()")
    rids = rt.xpath(".//div[@class='review-short']/@data-rid")
    # Skip cards missing either field rather than crashing on [0].
    if not titles or not rids:
        continue
    title = titles[0]
    rid = rids[0]
    # print(title, rid)

    # The list page only carries a shortened review; the full text comes from
    # a separate JSON endpoint keyed by the review id.
    full_url = f"https://movie.douban.com/j/review/{rid}/full"
    full_resp = requests.get(full_url, headers=my_headers, timeout=10)
    full_resp.raise_for_status()

    # The "body" field is an HTML fragment, so it is parsed again with lxml.
    body = full_resp.json()["body"]
    body_page = etree.HTML(body)
    if body_page is None:
        # etree.HTML returns None when the fragment has no parseable content.
        content = ""
    else:
        content = body_page.xpath(
            "//div[@class='review-content clearfix']//text()"
        )
        content = "".join(content)
        content = whitespace.sub("", content)

    print(title)
    print(content)
    print("===================================")

总结

requests 模块练习01

http://www.lryc.cn/news/421519.html

相关文章:

  • Datawhale X 魔搭 AI夏令营第四期 魔搭-AIGC方向 task02笔记
  • 多模态大语言模型的免训练视觉提示学习 ControlMLLM
  • Oracle|DM 常用|不常用 SQL大口袋
  • 嵌入式软件--模电基础 DAY 1
  • 【Nacos无压力源码领读】(二) 集成 LoadBalancer 与 OpenFeign
  • 《投资的原理》阅读笔记二——价值投资真是王者吗?
  • SSH、FTP、SFTP相关协议详解
  • C语言进阶——一文带你深度了解“C语言关键字”(中篇6)
  • 自建极简Ethercat主站-第8章 FOE基础功能实现
  • SQL Zoo 8.Using Null
  • LeetCode274. H 指数
  • 概述:Dubbo、Nacos、 Zookeeper 等分布式服务协调与治理等技术
  • 【LINUX】小工具降耦合,全内核函数插入宏摸索测试中。。
  • 24/8/12算法笔记 复习_线性回归
  • Linux系统驱动(十四)输入子系统
  • 力扣(2024.08.12)
  • 最新版的AutoGPT,我搭建好了
  • [SWPUCTF 2021 新生赛]PseudoProtocols(构造伪协议)
  • 基于STM32开发的智能语音助手系统
  • 基于python的图像去雾算法研究系统设计与实现
  • 自定义 View 可以播放一段视频
  • LVS负载均衡集群部署之—NAT模式的介绍及搭建步骤
  • 【算法】浅析哈希算法【附代码示例】
  • 2024.8.12
  • 使用Python解析pdf、docx等格式文件。
  • Linux网络通信基础API
  • Python爬虫:下载4K壁纸
  • 2024年【北京市安全员-B证】新版试题及北京市安全员-B证免费试题
  • python爬取B站视频实验
  • 10步搞定Python爬虫从零到精通!