python爬取豆瓣电影评论通用代码
最近在自学 Python 爬虫,今天闲来无事,爬了一下豆瓣数据。这个网站对于初学者来说还是很友好的。注意:有 Python 环境的朋友运行的时候,要把 cookie 换成自己的。通用性:可以自己换不同的电影 id 进行数据爬取。Tip:用 sleep 方法规避豆瓣反爬检测。话不多说,看效果。
短评截图:
影评截图:
短评部分源码:
import csv
import time
import requests
from bs4 import BeautifulSoup


def get_movie_name(movie_id, session):
    """Return the description text of a movie's Douban short-comment page.

    Args:
        movie_id: Douban subject id, interpolated into the comments URL.
        session: ``requests`` session used to send the GET request.

    Returns:
        The ``content`` attribute of the page's ``<meta name="description">``
        tag. Callers use it as the movie name in log lines and in the CSV
        file name.

    Raises:
        ValueError: If the response contains no description meta tag.
    """
    url = f'https://movie.douban.com/subject/{movie_id}/comments'
    # NOTE: the cookie below belongs to one specific logged-in account;
    # replace it with your own before running.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0',
        'cookie': 'll="118212"; bid=rFaMYPh6RCA; _pk_id.100001.4cf6=78012081e6779ccd.1750580841.; __yadk_uid=3Gu8udJKmiPINxEc1pMqhsMt3iTILw5x; _vwo_uuid_v2=D1DDB86BDCAAC17D1AC0CE3E327C775A3|2cae6702ce2d9ebdd6552e7f91d2c447; viewed="36492148_31223437_37345711"; __utmz=223695111.1752599205.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmz=30149280.1753857983.17.10.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmc=30149280; _ga=GA1.1.1274053689.1750580849; _ga_RXNMP372GL=GS2.1.s1753945876$o1$g1$t1753946097$j14$l0$h0; __utmc=223695111; dbcl2="225204235:K9IRNx2LsCg"; ck=l51l; push_noty_num=0; push_doumail_num=0; __utmv=30149280.22520; frodotk_db="bcc9b11a4f11b72a365d134eed35257e"; 6333762c95037d16=mRYvowDmt1XwRDfRGGQV32WFhY7sBklBCONF%2B3p0iam9najnKjbryn3PdaKfalCR7jfxXLUo3suPF3L%2BzLioR1P2RRXONp7n%2FsFc3BbJRF5l9eOMC1IroX8VRsEm3mml0lenQXG9GCPyawm2QXVTyjEiJgHu9R%2B3czAtR11hFGJJwY2LL1sjEjH1vHbyiQ1ZhgiRc2htCncWoQUq2jenlFydIEFHZnp9ZcDgLq58rQF4vRlERDfo%2BTItp2mHWQEd2JpX1u%2BAqXqO1GBGW%2B0sMWeThO8xc6q0pQU2uycmv4toL%2FY7P9CT4w%3D%3D; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1753963128%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Dj5LsrMjHtz4N71TPrjZZFswXuTN2I3JtWMVZOUA1JoavRFidVU8qEt3QhHh89lYn%26wd%3D%26eqid%3Dc34e9cd50159cadd0000000668768aa0%22%5D; _pk_ses.100001.4cf6=1; __utma=30149280.1274053689.1750580849.1753943729.1753963128.23; __utma=223695111.1607170658.1750580849.1753946099.1753963128.13; __utmb=223695111.0.10.1753963128; __utmb=30149280.6.10.1753963128; _TDID_CK=1753964509392',
    }
    # A timeout keeps a stalled connection from blocking the script forever.
    res = session.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(res.text, 'html.parser')
    description = soup.find('meta', attrs={'name': 'description'})
    if description is None:
        # Fail with a readable message instead of AttributeError on None.
        raise ValueError(
            f'no <meta name="description"> found at {url} '
            f'(HTTP {res.status_code}); the request may have been rejected'
        )
    return description.get('content')


def get_short_comments(movie_id):
    """Scrape short comments for one movie and save them to a CSV file.

    For each rating type in ``'hml'`` the first 20 pages are requested through
    ``get_short_comments_detail``, which receives the shared ``rows`` list.
    All collected rows are written to ``<movie_name>_豆瓣.csv`` in one go once
    every page has been processed.

    Reads the module-level ``count`` for progress messages; ``count`` must be
    defined before this function is called (the ``__main__`` guard sets it).
    NOTE(review): ``count`` is presumably advanced by
    ``get_short_comments_detail``, which is not part of this excerpt.

    Args:
        movie_id: Douban subject id of the movie to scrape.
    """
    global count
    started = time.time()
    rating_types = 'hml'  # h: positive, m: neutral, l: negative
    rows = []

    # Context managers guarantee the session and the file are closed even if
    # a request or a parse step raises half-way through.
    with requests.session() as session:
        movie_name = get_movie_name(movie_id, session)
        print(f'{movie_name}-短评-开始爬取')
        with open(f'{movie_name}_豆瓣.csv', mode='w', encoding='utf-8-sig', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['序号', '用户', '地点', '推荐度', '点赞数', '时间', '影评', '用户主页'])
            for rating_type in rating_types:
                for page in range(20):
                    get_short_comments_detail(movie_id, rating_type, page, session, rows)
                # NOTE(review): the original indentation was lost; the progress
                # line is assumed to be printed once per rating type.
                print(f'(h:好评,m:一般,l:差评) 类型:{rating_type},成功写入{count - 1}条')
            writer.writerows(rows)

    finished = time.time()
    print(f'{movie_name}-短评-爬取结束,共有{count - 1}条数据, 耗时:{(finished - started) / 60:.2f}分钟')


if __name__ == '__main__':
    # Example ids: 36809864 (南京照相馆), 35483395 (戏台), 36185502 (长安的荔枝)
    movie_id = '36809864'
    count = 1  # comment sequence number, starts at 1
    get_short_comments(movie_id)
影评部分源码:
import csv
import random
import time
import requests
from bs4 import BeautifulSoup


def get_movie_info(movie_id, session, headers):
    """Fetch a movie's review index page and extract its name and review count.

    The page's ``<h1>`` text is split at its last space: the part before it,
    minus its final three characters, is the movie name; the part after it,
    minus its first and last character, is the review count.
    NOTE(review): this presumably matches a heading of the form
    ``<title>的影评 (<count>)`` - confirm against the live page.

    Args:
        movie_id: Douban subject id, interpolated into the reviews URL.
        session: ``requests`` session used to send the GET request.
        headers: HTTP headers (user agent, cookie) sent with the request.

    Returns:
        ``[movie_name, max_count]`` where both items are strings.

    Raises:
        ValueError: If the response contains no ``<h1>`` element.
    """
    main_url = f'https://movie.douban.com/subject/{movie_id}/reviews'
    # A timeout keeps a stalled connection from blocking the script forever.
    resp = session.get(main_url, headers=headers, timeout=10)
    soup = BeautifulSoup(resp.text, 'html.parser')
    heading = soup.find('h1')
    if heading is None:
        # Fail with a readable message instead of AttributeError on None.
        raise ValueError(
            f'no <h1> found at {main_url} (HTTP {resp.status_code}); '
            'the request may have been rejected'
        )
    # Split on the LAST space so that titles containing spaces stay intact.
    title_part, _, count_part = heading.text.strip().rpartition(' ')
    movie_name = title_part[:-3]
    max_count = count_part[1:-1]
    print(f'电影: {movie_name}')
    print(f'影评总数: {max_count}')
    return [movie_name, max_count]


def creat_file(movie_name):
    """Create the review CSV for a movie and write its header row.

    The file is created at ``影评/<movie_name>_豆瓣影评.csv``; the ``影评``
    directory is created first if it does not exist yet.

    Args:
        movie_name: Movie name used in the file name.

    Returns:
        ``[f, writer]``: the open file object and a ``csv.writer`` bound to
        it. The caller is responsible for closing ``f``.
    """
    import os

    # open() does not create missing parent directories.
    os.makedirs('影评', exist_ok=True)
    f = open(f'影评/{movie_name}_豆瓣影评.csv', 'w', encoding='utf-8-sig', newline='')
    writer = csv.writer(f)
    writer.writerow(["影评id", "用户", "ip地址", "时间", "推荐度", "星级", "标题", "影评", "影评中的图片", "赞同", "反对", "收藏", "转发", "回应"])
    print(f'{movie_name},csv文件创建成功')
    return [f, writer]


def get_data(movie_id):
    """Scrape the long-form reviews of one movie into a CSV file.

    Looks up the movie name and total review count, creates the output file,
    then delegates the actual scraping to ``get_comments`` (defined outside
    this excerpt). The session and the CSV file are always closed afterwards.

    Args:
        movie_id: Douban subject id of the movie to scrape.
    """
    started = time.time()
    # NOTE: the cookie below belongs to one specific logged-in account;
    # replace it with your own before running.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0',
        'cookie': '_TDID_CK=1754050907914; 6333762c95037d16=iKYsKtNQiUl5kslkdyrAZG0WrN5mr8g856oYqDXKywvnAoRVHz4ciIxnMO811Vq4ttNsLQksxwroxQyWShHnAEbjmrRep%2BElkXKB%2BYpO2ULS6NMXtuqe%2FxfP3CmcWNNLo0hxT98%2BFU7sN8Vh8hc6DUvTHdya3aVNLMSg0RQw7ZJ805Rae0aT8YipKiYxwNGG7vVAE2%2FCdcqdZVDTnNgdGp0A2HS5%2F5Xd85gy8orfe2B3V75l2h5bUDZOLpkZ3kqBth%2FxWxZRWILDApqZfz0YYrdx3uxiIV352E%2BpnfTNJTPrBcFzMQBV9A%3D%3D; bid=rFaMYPh6RCA; _pk_id.100001.4cf6=78012081e6779ccd.1750580841.; __yadk_uid=3Gu8udJKmiPINxEc1pMqhsMt3iTILw5x; _vwo_uuid_v2=D1DDB86BDCAAC17D1AC0CE3E327C775A3|2cae6702ce2d9ebdd6552e7f91d2c447; viewed="36492148_31223437_37345711"; __utmz=223695111.1752599205.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _ga=GA1.1.1274053689.1750580849; dbcl2="225204235:K9IRNx2LsCg"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.22520; loc-last-index-location-id="108288"; ll="108288"; _ga_RXNMP372GL=GS2.1.s1753976615$o2$g1$t1753976764$j55$l0$h0; _ga_Y4GN1R87RG=GS2.1.s1753976779$o1$g1$t1753976791$j48$l0$h0; __utmz=30149280.1754045768.26.12.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ap_v=0,6.0; ck=l51l; __utma=30149280.1274053689.1750580849.1754045768.1754050699.27; __utmb=30149280.0.10.1754050699; __utmc=30149280; __utma=223695111.1607170658.1750580849.1753975932.1754050699.16; __utmb=223695111.0.10.1754050699; __utmc=223695111; frodotk_db="0c8f56b0cff3fc5c25ae04126fdbf514"; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1754050700%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Dj5LsrMjHtz4N71TPrjZZFswXuTN2I3JtWMVZOUA1JoavRFidVU8qEt3QhHh89lYn%26wd%3D%26eqid%3Dc34e9cd50159cadd0000000668768aa0%22%5D; _pk_ses.100001.4cf6=1',
    }
    session = requests.Session()
    csv_file = None
    try:
        movie_info = get_movie_info(movie_id, session, headers)  # [movie name, total review count]
        csv_file, writer = creat_file(movie_info[0])
        # All reviews are fetched by default. To cap the number, overwrite
        # movie_info[1] here (e.g. movie_info[1] = 40); per the original
        # author's note, each page holds 20 reviews.
        get_comments(movie_id, int(movie_info[1]), session, headers, csv_file, writer)
    finally:
        # Closing an already-closed file is a no-op, so this is safe even if
        # get_comments closes the file itself.
        if csv_file is not None:
            csv_file.close()
        session.close()
    finished = time.time()
    print(f'成功获取{movie_info[1]}条评论,耗时:{(finished - started) / 60:.2f}分钟')


if __name__ == '__main__':
    # Movie ids: 36809864 (南京照相馆), 36438475 (浪浪山小妖怪)
    movie_id = 36438475
    get_data(movie_id)
freelybill