爬虫练习_01
前言
基础爬虫小练习01
一、requests 模块使用
demo_01
# Demo 01: fetch the Douban Top 250 page and print each listed film's
# title and rating.
import requests
from lxml import etree

url = "https://movie.douban.com/top250"

# Request headers replayed from a captured browser session.
# Deliberately NOT sent:
#   * authority / method / path / scheme - these are the browser's HTTP/2
#     pseudo-header rows, not real request headers.
#   * accept-encoding - the captured value advertised "br" and "zstd"; if
#     the matching decoders are not installed, resp.text would be garbage.
#     Leaving it out lets requests advertise only what it can decode.
# NOTE(review): the cookie is a hard-coded snapshot and will go stale.
headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "zh-CN,zh;q=0.9",
    "cookie": "bid=cqDgOHLreU0; _pk_id.100001.4cf6=a48c68336a0dbbc8.1723366816.; _pk_ses.100001.4cf6=1; __utma=30149280.1388041158.1723366816.1723366816.1723366816.1; __utmb=30149280.0.10.1723366816; __utmc=30149280; __utmz=30149280.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=223695111.165463845.1723366816.1723366816.1723366816.1; __utmb=223695111.0.10.1723366816; __utmc=223695111; __utmz=223695111.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); ap_v=0,6.0; __yadk_uid=RjXMEHOKLnUzT6aAzCxf17A5kcLgeeL9",
    "priority": "u=0, i",
    "sec-ch-ua": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
}

# A timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status() stops us from parsing an HTTP error page as if it were
# the listing.
resp = requests.get(url=url, headers=headers, timeout=10)
resp.raise_for_status()

page = etree.HTML(resp.text)
# One <div class="item"> per film inside the ranking list.
rts = page.xpath("//ol[@class='grid_view']/li/div[@class='item']")
for rt in rts:
    # [0] takes the first matching <span class="title"> of the item.
    title = rt.xpath(".//span[@class='title']/text()")[0]
    score = rt.xpath(".//span[@class='rating_num']/text()")[0]
    print(title, score)
demo_02
# Demo 02: call the JSON endpoint behind the Douban chart page below and
# print the decoded response.
# https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action=
import requests
import lxml  # not used by this demo; kept so the file's imports are unchanged

url = "https://movie.douban.com/j/chart/top_list"

# Request headers replayed from a captured browser session.
# Deliberately NOT sent:
#   * authority / method / path / scheme - HTTP/2 pseudo-header rows from
#     the browser, not real request headers.
#   * accept-encoding - the captured value advertised "br" and "zstd"; if
#     the matching decoders are not installed the body could not be decoded.
#   * Enhanced / The / New / Updated - stray text from the DevTools
#     "What's new" panel that was pasted in as if it were headers.
# NOTE(review): the cookie is a hard-coded snapshot and will go stale.
headers = {
    "accept": "*/*",
    "accept-language": "zh-CN,zh;q=0.9",
    "cookie": "bid=cqDgOHLreU0; _pk_id.100001.4cf6=a48c68336a0dbbc8.1723366816.; _pk_ses.100001.4cf6=1; __utma=30149280.1388041158.1723366816.1723366816.1723366816.1; __utmb=30149280.0.10.1723366816; __utmc=30149280; __utmz=30149280.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=223695111.165463845.1723366816.1723366816.1723366816.1; __utmb=223695111.0.10.1723366816; __utmc=223695111; __utmz=223695111.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); ap_v=0,6.0; __yadk_uid=RjXMEHOKLnUzT6aAzCxf17A5kcLgeeL9; ll=\"108296\"; _vwo_uuid_v2=DA8FF1B0948EE9728736018FE9DFF12E8|1eb76b564769076007eeb2dd472eae01",
    "priority": "u=1, i",
    "referer": "https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action=",
    "sec-ch-ua": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
    "x-requested-with": "XMLHttpRequest",
}

# Query-string parameters; requests URL-encodes them (e.g. the ":" in
# interval_id) when building the final URL.
my_params = {
    "type": "13",
    "interval_id": "100:90",
    "action": "",
    "start": "0",
    "limit": "20",
}

# Timeout so a stalled connection cannot hang the script; raise_for_status()
# so an HTTP error surfaces as such instead of as a JSON decode failure.
page = requests.get(url=url, params=my_params, headers=headers, timeout=10)
page.raise_for_status()
# For debugging, page.request.url shows the URL that was actually requested.
print(page.json())
demo_03
# Demo 03: POST a form to the iciba translation endpoint and print the
# decoded JSON reply.
# https://www.iciba.com/translate
import requests

url = "https://ifanyi.iciba.com/index.php"

# Request headers replayed from a captured browser session.
# Deliberately NOT sent:
#   * authority / method / path / scheme - HTTP/2 pseudo-header rows from
#     the browser, not real request headers.
#   * accept-encoding - the captured value advertised "br" and "zstd"; if
#     the matching decoders are not installed the body could not be decoded.
#   * content-length - it was hard-coded to "34", which is only right for
#     one specific form body; requests computes the correct value itself.
#   * Enhanced / The / New / Updated - stray text from the DevTools
#     "What's new" panel that was pasted in as if it were headers.
my_headers = {
    "accept": "application/json, text/plain, */*",
    "accept-language": "zh-CN,zh;q=0.9",
    "content-type": "application/x-www-form-urlencoded",
    "origin": "https://www.iciba.com",
    "priority": "u=1, i",
    "referer": "https://www.iciba.com/",
    "sec-ch-ua": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-site",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
}

# Query-string parameters appended to the URL.
# NOTE(review): "sign" was captured together with this exact form body and
# is presumably tied to it - verify before changing "q".
my_param = {
    "c": "trans",
    "m": "fy",
    "client": "6",
    "auth_user": "key_web_new_fanyi",
    "sign": "SioZA5jWOFlUevETjhAhp9RriqVIAJSQ+xmfU0q7dIE=",
}

# Form fields sent as the urlencoded request body.
form_data = {
    "from": "auto",
    "to": "auto",
    "q": "i love you",
}

# Timeout so a stalled connection cannot hang the script; raise_for_status()
# so an HTTP error surfaces as such instead of as a JSON decode failure.
resp = requests.post(
    url, params=my_param, data=form_data, headers=my_headers, timeout=10
)
resp.raise_for_status()
print(resp.json())
demo_04
# Demo 04: download a single image and save it as tu.jpg.
import requests

url = "https://img.yituyu.com/gallery/8234/00_llcUYWmo.jpg"

# Request headers replayed from a captured browser session.
# Deliberately NOT sent:
#   * if-modified-since / if-none-match - these are the browser's cache
#     validators; replaying them lets the server answer "304 Not Modified"
#     with an empty body, which would then be written out as an empty file.
#   * accept-encoding - the captured value advertised "br" and "zstd"; if
#     the matching decoders are not installed the body could not be decoded.
# NOTE(review): the cookie is a hard-coded snapshot and will go stale.
headers = {
    "accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
    "accept-language": "zh-CN,zh;q=0.9",
    "connection": "keep-alive",
    "cookie": "yituyu_first_time=1723381635000; yituyu_os=Windows%20NT%2010.0%3B%20Win64%3B%20x64; Hm_lvt_9714eb07ec1e2c497aefe3d4dfded3ed=1723381639; HMACCOUNT=211F43F3C6950EDF; Hm_lpvt_9714eb07ec1e2c497aefe3d4dfded3ed=1723381807",
    "host": "img.yituyu.com",
    "referer": "https://www.yituyu.com/",
    "sec-ch-ua": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "image",
    "sec-fetch-mode": "no-cors",
    "sec-fetch-site": "same-site",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
}

# Timeout so a stalled connection cannot hang the script; raise_for_status()
# so an error page is never saved to disk as if it were the image.
resp = requests.get(url, headers=headers, timeout=10)
resp.raise_for_status()
print(resp.content)

# resp.content is the raw response body as bytes, hence binary mode.
with open("tu.jpg", mode="wb") as f:
    f.write(resp.content)
demo_05
# Demo 05: list the reviews on the Douban page below, then fetch and print
# the full text of each one.
# https://movie.douban.com/review/best/
import requests
from lxml import etree
import re

url = "https://movie.douban.com/review/best/"

# Request headers replayed from a captured browser session; reused for both
# the listing page and every per-review request.
# Deliberately NOT sent:
#   * authority / method / path / scheme - HTTP/2 pseudo-header rows from
#     the browser, not real request headers.
#   * accept-encoding - the captured value advertised "br" and "zstd"; if
#     the matching decoders are not installed, resp_1.text would be garbage.
# NOTE(review): the cookie is a hard-coded snapshot and will go stale.
my_headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "zh-CN,zh;q=0.9",
    "cache-control": "max-age=0",
    "cookie": "bid=cqDgOHLreU0; _pk_id.100001.4cf6=a48c68336a0dbbc8.1723366816.; __utmc=30149280; __utmz=30149280.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmc=223695111; __utmz=223695111.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __yadk_uid=RjXMEHOKLnUzT6aAzCxf17A5kcLgeeL9; ll=\"108296\"; _vwo_uuid_v2=DA8FF1B0948EE9728736018FE9DFF12E8|1eb76b564769076007eeb2dd472eae01; _pk_ses.100001.4cf6=1; ap_v=0,6.0; __utma=30149280.1388041158.1723366816.1723366816.1723382928.2; __utmb=30149280.0.10.1723382928; __utma=223695111.165463845.1723366816.1723366816.1723382928.2; __utmb=223695111.0.10.1723382928",
    "priority": "u=0, i",
    "referer": "https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action=",
    "sec-ch-ua": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
}

# Timeout so a stalled connection cannot hang the script; raise_for_status()
# so an HTTP error page is not parsed as if it were the listing.
resp_1 = requests.get(url, headers=my_headers, timeout=10)
resp_1.raise_for_status()

page = etree.HTML(resp_1.text)
# Every element carrying a data-cid attribute is treated as one review card.
rts = page.xpath("//div[@data-cid]")
print(len(rts))

for rt in rts:
    title = rt.xpath(".//div[@class='main-bd']/h2/a/text()")[0]
    # The review id is needed to build the URL of the full-text endpoint.
    rid = rt.xpath(".//div[@class='review-short']/@data-rid")[0]

    full_url = f"https://movie.douban.com/j/review/{rid}/full"
    full_resp = requests.get(full_url, headers=my_headers, timeout=10)
    full_resp.raise_for_status()

    # The "body" field of the JSON reply is an HTML fragment, so it is
    # parsed again to pull out the text nodes.
    body = full_resp.json()["body"]
    body_page = etree.HTML(body)
    content = body_page.xpath("//div[@class='review-content clearfix']//text()")
    content = "".join(content)
    # Strip every whitespace character so the review prints as one block.
    content = re.sub(r"\s", "", content)

    print(title)
    print(content)
    print("===================================")
总结
requests 模块练习01