import requests
from lxml import etree
import time

BASE_DOMAIN = 'http://www.dytt8.net'

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
}


def get_detail_urls(url):
    """Get the URLs of the movie detail pages.

    :param url: URL of one page of the movie list
    :return: list of absolute detail-page URLs
    """
    response = requests.get(url, headers=HEADERS)
    html_element = etree.HTML(response.text)
    detail_urls = html_element.xpath('//table[@class="tbspan"]//a/@href')
    # Drop the stray index link that appears among the results, then
    # turn each relative path into an absolute URL. (Filtering with a
    # comprehension avoids removing items from a list while iterating it.)
    detail_urls = [
        BASE_DOMAIN + detail_url
        for detail_url in detail_urls
        if detail_url != '/html/gndy/jddy/index.html'
    ]
    return detail_urls


def parse_detail_page(detail_url):
    """Parse one movie detail page.

    :param detail_url: URL of the detail page
    :return: dict describing the movie
    """
    response = requests.get(detail_url, headers=HEADERS)
    # Detail pages are GBK-encoded; ignore any undecodable bytes.
    text = response.content.decode('gbk', errors='ignore')
    html_element = etree.HTML(text)
    title = html_element.xpath(
        '//div[@class="title_all"]//font[@color="#07519a"]/text()')[0]
    zoom_element = html_element.xpath('//div[@id="Zoom"]')[0]
    imgs = zoom_element.xpath('.//img/@src')

    # Defaults, so every field (including desc) exists even when the
    # corresponding section is missing from the page.
    year = country = movie_type = rating = duration = director = ''
    cover = screen_shot = download_url = desc = ''
    actors = []
    if len(imgs) > 0:
        cover = imgs[0]
    if len(imgs) > 1:
        screen_shot = imgs[1]

    infos = zoom_element.xpath('.//text()')

    def parse_info(info, rule):
        return info.replace(rule, '').strip()

    for key, info in enumerate(infos):
        if info.startswith('◎年 代'):
            year = parse_info(info, '◎年 代')
        elif info.startswith('◎产 地'):
            country = parse_info(info, '◎产 地')
        elif info.startswith('◎类 别'):
            movie_type = parse_info(info, '◎类 别')
        elif info.startswith('◎豆瓣评分'):
            rating = parse_info(info, '◎豆瓣评分')
        elif info.startswith('◎片 长'):
            duration = parse_info(info, '◎片 长')
        elif info.startswith('◎导 演'):
            director = parse_info(info, '◎导 演')
        elif info.startswith('◎主 演'):
            # The first actor shares the label's line; the rest follow,
            # one per text node, until the synopsis marker.
            actor_first = parse_info(info, '◎主 演')
            actors = [actor_first]
            for index in range(key + 1, len(infos)):
                item = infos[index].strip()
                if item.startswith('◎简 介'):
                    break
                actors.append(item)
        elif info.startswith('◎简 介'):
            # The synopsis runs until the download-links marker.
            for index in range(key + 1, len(infos)):
                item = infos[index].strip()
                if item.startswith('【下载地址】'):
                    break
                desc = item

    print(detail_url)
    if len(html_element.xpath('//td[@bgcolor="#fdfddf"]/a/text()')) > 0:
        download_url = html_element.xpath('//td[@bgcolor="#fdfddf"]/a/text()')[0]
    elif len(html_element.xpath('//td[@bgcolor="#fdfddf"]/text()')) > 0:
        download_url = html_element.xpath('//td[@bgcolor="#fdfddf"]/text()')[0]

    film = {
        'title': title,
        'cover': cover,
        'screen_shot': screen_shot,
        'year': year,
        'country': country,
        'type': movie_type,
        'rating': rating,
        'duration': duration,
        'director': director,
        'actors': actors,
        'desc': desc,
        'download_url': download_url,
    }
    return film


def spider():
    """Entry point of the crawler.

    :return:
    """
    base_url = 'http://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
    films = []
    for index in range(1, 11):
        print('Start crawling page {}'.format(index))
        url = base_url.format(index)
        detail_urls = get_detail_urls(url)
        for key, detail_url in enumerate(detail_urls):
            film = parse_detail_page(detail_url)
            films.append(film)
            time.sleep(1)  # pause between detail-page requests to be polite
    print(films)


if __name__ == '__main__':
    spider()
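
# A minimal sketch (not part of the original script) for persisting the
# scraped results instead of only printing them. It assumes spider() is
# changed to end with `return films`; the filename 'films.json' is an
# illustrative choice, not from the source.
#
# import json
#
# films = spider()
# with open('films.json', 'w', encoding='utf-8') as f:
#     json.dump(films, f, ensure_ascii=False, indent=2)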