当前位置: 首页 > article >正文

【宅男宅女们的福音】电影天堂最新电影爬取及搜索脚本

多线程电影天堂最新资源爬取脚本、电影搜索脚本
PS:为方便大家使用,爬取结果会写入 HTML 文件并生成表格。
线程数可以在脚本里直接修改(THREADS 常量,默认为 20);实测线程数为 30 时,IP 可能会被限制访问。[阳光电影是电影天堂的马甲]

环境: Python3

最新电影爬取代码

# -*- coding: utf-8 -*-
import random
import threading
import requests as req
from lxml import etree
from html import escape as html_escape
from queue import Queue

BASE_URL_COM = 'http://www.ygdy8.com'
BASE_URL_NET = 'http://www.ygdy8.net'
# Number of worker threads started for each crawl stage.
THREADS = 20
# Number of listing pages (1..PAGE_TOTAL) that are enqueued for crawling.
PAGE_TOTAL = 100
# Seconds to wait on a single HTTP request; without a timeout one stalled
# request would keep Queue.join() from ever returning.
REQUEST_TIMEOUT = 15

# Static HTML written before / after the generated table rows.
HEAD = (
    '<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8">'
    '<title>阳光电影 - 电影天堂</title>'
    '<link href="https://cdn.bootcss.com/bootstrap/4.0.0/css/bootstrap.min.css" rel="stylesheet">'
    '</head><body><table class="table"><thead class="thead-dark"><tr>'
    '<th scope="col">#</th><th scope="col">电影名</th><th scope="col">下载地址</th>'
    '</tr></thead><tbody class="table-hover">'
)
FOOT = '</tbody></table></body></html>'

# Pool of User-Agent strings; one is picked at random for every request.
USER_AGENTS = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1',
    'Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5',
    'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
)

# Row number of the next table row; shared by all parse_html workers.
count = 1
# Guards `count` and writes to the result file, which are touched by every
# parse_html worker thread.
_write_lock = threading.Lock()


def get_headers():
    """Return request headers with a randomly chosen User-Agent."""
    return {'User-Agent': random.choice(USER_AGENTS)}


def _fetch_tree(url):
    """Download *url* and return its parsed lxml HTML tree."""
    res = req.get(url, headers=get_headers(), timeout=REQUEST_TIMEOUT)
    # Treat HTTP error statuses as failures instead of parsing an error page.
    res.raise_for_status()
    res.encoding = res.apparent_encoding
    return etree.HTML(res.text)


def _build_row(index, title, downloads):
    """Return one HTML table row; scraped text is escaped before embedding."""
    links = '<hr>'.join(html_escape(str(link)) for link in downloads)
    return '<tr><th scope="row">%d</th><td>%s</td><td>%s</td></tr>' % (
        index, html_escape(str(title)), links)


def get_url(list_queue, url_queue):
    """Worker loop: turn listing-page URLs into detail-page paths.

    Takes listing URLs from *list_queue* forever, extracts every link found
    under the ``co_content8`` block and puts its ``href`` on *url_queue*.
    Failures are reported and the page is skipped.
    """
    while True:
        url = list_queue.get()
        try:
            tree = _fetch_tree(url)
            for tag in tree.xpath('//div[@class="co_content8"]/ul//a'):
                href = tag.get('href')
                if not href:
                    # An anchor without href cannot be turned into a URL.
                    continue
                url_queue.put(href)
                print('[Subscribe] [%s]' % href)
        except Exception:
            # Worker boundary: one bad page must not kill the thread.
            print('[Subscribe Error] %s' % url)
        finally:
            # Always acknowledge the item so list_queue.join() can return.
            list_queue.task_done()


def get_list(list_queue):
    """Put the URLs of listing pages 1..PAGE_TOTAL on *list_queue*."""
    list_url = BASE_URL_COM + '/html/gndy/dyzz/list_23_%d.html'
    for page in range(1, PAGE_TOTAL + 1):
        list_queue.put(list_url % page)


def parse_download(url):
    """Fetch a detail page and return ``(title, downloads)``.

    *downloads* is the list of ``href`` values found inside the tables of
    the ``Zoom`` block. Raises if the request fails or the title element is
    missing (``IndexError``).
    """
    tree = _fetch_tree(url)
    title = tree.xpath('//div[@class="bd3r"]//div[@class="title_all"]/h1/font')[0].text
    downloads = tree.xpath('//div[@id="Zoom"]//table//a/@href')
    return title, downloads


def parse_html(url_queue, result_file):
    """Worker loop: write one table row per detail page to *result_file*.

    Takes URL paths from *url_queue* forever. Each path is tried against
    BASE_URL_COM first and, if that fails, against BASE_URL_NET. Failures
    on both hosts are reported and the path is skipped.
    """
    global count
    while True:
        url_path = url_queue.get()
        try:
            try:
                title, downloads = parse_download(BASE_URL_COM + url_path)
            except Exception:
                title, downloads = parse_download(BASE_URL_NET + url_path)
            # Numbering and writing must happen atomically, otherwise two
            # threads can emit the same row number.
            with _write_lock:
                result_file.write(_build_row(count, title, downloads))
                print('[OK][%d] %s' % (count, title))
                count += 1
        except Exception:
            # Worker boundary: one bad page must not kill the thread.
            print('[Parse error] %s' % url_path)
        finally:
            # Always acknowledge the item so url_queue.join() can return.
            url_queue.task_done()


def thread(thread_name, target, args):
    """Start THREADS daemon workers and block until the queue is drained.

    Args:
        thread_name: the queue to wait on (despite the name, it is the
            object whose ``join()`` is called).
        target: worker function run by every thread.
        args: positional arguments passed to *target*.
    """
    for _ in range(THREADS):
        # Daemon threads: the workers loop forever and must not keep the
        # process alive once the main thread is done.
        worker = threading.Thread(target=target, args=args, daemon=True)
        worker.start()
    thread_name.join()


def main():
    """Crawl the listing pages, then write all results to ``result.html``."""
    list_queue = Queue()
    url_queue = Queue()
    get_list(list_queue)
    thread(list_queue, get_url, (list_queue, url_queue))
    # HEAD declares UTF-8, so the file must be written as UTF-8 regardless of
    # the platform default encoding.
    with open('result.html', 'w', encoding='utf-8') as result_file:
        result_file.write(HEAD)
        thread(url_queue, parse_html, (url_queue, result_file))
        result_file.write(FOOT)
    print('End... 老铁记得顶我(TieZi)\nEnd... 老铁记得顶我(TieZi)\nEnd... 老铁记得顶我(TieZi)')


if __name__ == '__main__':
    main()

搜索电影代码

# -*- coding: utf-8 -*-
import sys
import random
import requests as req
from urllib import parse
from lxml import etree
from multiprocessing import Pool

BASE_URL = 'http://www.ygdy8.com'
# The keyword appended to this URL must be URL-encoded.
SEARCH_URL = 'http://s.ygdy8.com/plus/so.php?kwtype=0&searchtype=title&pagesize=1000&keyword='
# Seconds to wait on a single HTTP request before giving up.
REQUEST_TIMEOUT = 15

# Pool of User-Agent strings; one is picked at random for every request.
USER_AGENTS = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1',
    'Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5',
    'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
)


def get_headers():
    """Return request headers with a randomly chosen User-Agent."""
    return {'User-Agent': random.choice(USER_AGENTS)}


def _fetch_tree(url):
    """Download *url* and return its parsed lxml HTML tree."""
    res = req.get(url, headers=get_headers(), timeout=REQUEST_TIMEOUT)
    res.encoding = res.apparent_encoding
    return etree.HTML(res.text)


def search(keyword):
    """Return the absolute detail-page URLs matching *keyword*.

    The keyword is encoded as GBK and percent-quoted before being appended
    to SEARCH_URL.

    Raises:
        UnicodeEncodeError: if *keyword* contains characters that cannot be
            encoded as GBK.
    """
    quoted = parse.quote(keyword.encode("gbk"))
    tree = _fetch_tree(SEARCH_URL + quoted)
    result_urls = []
    for tag in tree.xpath('//div[@class="co_content8"]/ul//a'):
        href = tag.get('href')
        if href:
            # Anchors without href cannot be turned into a URL; skip them.
            result_urls.append(BASE_URL + href)
    return result_urls


def parse_html(url):
    """Fetch one detail page and print its title and download links.

    Runs inside a pool worker. A page that cannot be fetched or parsed is
    reported and skipped, so that a single failure does not abort the whole
    ``pool.map`` call.
    """
    try:
        tree = _fetch_tree(url)
        title = tree.xpath('//div[@class="bd3r"]//div[@class="title_all"]/h1/font')[0].text
        downloads = tree.xpath('//div[@id="Zoom"]//table//a/@href')
    except Exception as exc:
        # Worker boundary: report and move on to the next URL.
        print('[Parse error] %s (%s)' % (url, exc))
        return
    lines = ['[%s]' % title]
    lines.extend('[下载链接] [%s]' % download for download in downloads)
    lines.append('\n|----------------------------------------------------------|\n')
    # Emit the whole record with a single print so output from concurrent
    # worker processes is less likely to interleave mid-record.
    print('\n'.join(lines))


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: python %s movie_name" % sys.argv[0])
        sys.exit(-1)
    urls = search(sys.argv[1])
    # The context manager tears the pool down once map() has returned.
    with Pool() as pool:
        pool.map(parse_html, urls)

最新电影爬取效果



生成后的HTML表格



电影搜索效果



[补图]爬取的下载链接含FTP和磁力链接,片源有磁力链接就会被爬取
爬取结果


搜索结果

 

转载于:https://www.cnblogs.com/ECJTUACM-873284962/p/8560689.html

http://www.lryc.cn/news/2413841.html

相关文章:

  • SQL实战50例:MySQL查询语句练习
  • C语言之指针与结构体
  • 古代奥运会创始人是谁?
  • Android-保姆级初步工作
  • 如何防御DDOS攻击 DDOS攻击是什么意思
  • 常用黑盒测试方法
  • 使用canvas进行图像处理
  • JSBSim学习笔记(1)——简介
  • WakeLock保持后台唤醒状态
  • js disabled属性的添加与删除
  • USACO1.4 母亲的牛奶 Mother's Milk
  • Reflect中MethodInfo使用方法
  • MyEclipse 8.0 M1 下载 (Standard and Pro Editions)
  • 本地连接的ip地址 子网掩码 默认网关 还有dns服务器地址怎么设置? (转自网易博客)
  • OA项目之我的审批(查询会议签字)
  • nodejs 使用async进行BT吧最新电影数据爬取
  • FLASH常见问题
  • C/C++《计算思维综合实践I》参考选题(84题)[2024-05-22]
  • 个人面试总结暨2020年终总结
  • 聊一聊go的单元测试(goconvey、gomonkey、gomock、ginkgo)
  • 乐Pad A1拆机全程
  • 小周恋爱日记网站
  • 恐龙机器人钢索恐龙形态_机器恐龙铁渣2.0:P1S的钢索终于有伴了
  • 如何查找和注册已备案过期域名
  • 【纯转】Div+CSS经典速成教程。
  • 一文实现nnUNet v2 分割肾脏肿瘤数据集KiTS19
  • win篇--winserver2008R2系统自动更新报错:代码:80092004
  • 火狐与IE兼容性总结(待整理,代码有点乱)
  • MDK常用快捷键和操作
  • 企业竞争竞争情报系统的流程整合