tvbox/py/小鸭看看.py
Wang.Luo cad87a83b6 feat(py): 新增小鸭看看影视爬虫接口
- 新增 `小鸭看看.py` 爬虫文件,支持电影、电视剧、综艺、动漫及福利等分类
- 实现首页推荐、分类列表、详情页、播放地址解析及Google站内搜索功能
- 支持多线路播放源和分页筛选,兼容m3u8、mp4等多种视频格式
- 更新 `api.json` 配置,指向新的Python爬虫脚本并移除旧XBPQ配置依赖
2025-09-20 01:21:04 +08:00

489 lines
21 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding=utf-8
import re
import json
import time
from urllib.parse import quote, urljoin, urlparse, parse_qs
import sys
# 导入外部库,现在可以正常使用
from bs4 import BeautifulSoup
import gzip
sys.path.append("..")
from base.spider import Spider
class Spider(Spider):
    """TVBox spider for the xiaoyakankan.com movie/TV site."""

    def __init__(self):
        # Display name reported through getName().
        self.name = "小鸭子看看"
        # Known site mirrors; keys are selectable via init()'s extend JSON.
        self.hosts = {
            "main": "https://xiaoyakankan.com",
            "tw": "https://tw.xiaoyakankan.com",
        }
        # The "tw" mirror is used unless init() overrides it.
        self.default_host = "tw"
        # Desktop Chrome user agent sent with every request.
        self.ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        # File extensions treated as directly playable video URLs.
        self.VIDEO_FORMATS = ['.m3u8', '.mp4', '.flv', '.avi', '.mkv', '.mov']
def getName(self):
    """Return the human-readable name of this spider."""
    return self.name
def init(self, extend=""):
    """Optional runtime configuration hook.

    `extend` may carry a JSON object such as {"host": "main"} that selects
    one of the mirrors declared in self.hosts; anything else (empty string,
    malformed JSON, non-dict payload, unknown host key) leaves the default
    mirror untouched.
    """
    if not extend:
        return
    try:
        config = json.loads(extend)
        if config.get("host") in self.hosts:
            self.default_host = config["host"]
            self.log(f"已切换默认域名至:{self.hosts[self.default_host]}", "INFO")
    # Narrowed from a bare `except:` (which also swallowed SystemExit /
    # KeyboardInterrupt): ValueError covers invalid JSON, TypeError a
    # non-string payload, AttributeError a decoded non-dict (e.g. a list).
    except (ValueError, TypeError, AttributeError):
        self.log("初始化参数解析失败使用默认tw子域名", "WARNING")
def log(self, msg, level="INFO"):
    """Print a timestamped log line tagged with the level and spider name."""
    stamp = time.strftime('%Y-%m-%d %H:%M:%S')
    line = f"[{level}] [{self.name}] {stamp} - {msg}"
    print(line)
def get_current_host(self):
    """Return the base URL of the currently selected mirror."""
    active = self.default_host
    return self.hosts[active]
def homeContent(self, filter):
    """Return home-page category metadata for TVBox.

    The `filter` argument is part of the spider interface but unused here.
    Returns a dict with:
      'class'   -- ordered category list (type_name / type_id / land / filters)
      'filters' -- the same filter definitions keyed by type_id
    """
    classes = [
        {"type_name": "电影", "type_id": "10", "land": "1", "filters": [
            {"key": "class", "name": "类型", "value": [
                {"n": "全部", "v": "10"},
                {"n": "动作片", "v": "1001"},
                {"n": "喜剧片", "v": "1002"},
                {"n": "爱情片", "v": "1003"},
                {"n": "科幻片", "v": "1004"},
                {"n": "恐怖片", "v": "1005"},
                {"n": "剧情片", "v": "1006"},
                {"n": "战争片", "v": "1007"},
                {"n": "纪录片", "v": "1008"},
                {"n": "微电影", "v": "1009"},
                {"n": "动漫电影", "v": "1010"},
                {"n": "奇幻片", "v": "1011"},
                {"n": "动画片", "v": "1013"},
                {"n": "犯罪片", "v": "1014"},
                {"n": "悬疑片", "v": "1016"},
                {"n": "欧美片", "v": "1017"},
                {"n": "邵氏电影", "v": "1019"},
                {"n": "同性片", "v": "1021"},
                {"n": "家庭片", "v": "1024"},
                {"n": "古装片", "v": "1025"},
                {"n": "历史片", "v": "1026"},
                {"n": "4K电影", "v": "1027"}
            ]}
        ]},
        {"type_name": "连续剧", "type_id": "11", "land": "1", "filters": [
            {"key": "class", "name": "地区类型", "value": [
                {"n": "全部", "v": "11"},
                {"n": "国产剧", "v": "1101"},
                {"n": "香港剧", "v": "1102"},
                {"n": "台湾剧", "v": "1105"},
                {"n": "韩国剧", "v": "1103"},
                {"n": "欧美剧", "v": "1104"},
                {"n": "日本剧", "v": "1106"},
                {"n": "泰国剧", "v": "1108"},
                {"n": "港台剧", "v": "1110"},
                {"n": "日韩剧", "v": "1111"},
                {"n": "东南亚剧", "v": "1112"},
                {"n": "海外剧", "v": "1107"}
            ]}
        ]},
        {"type_name": "综艺", "type_id": "12", "land": "1", "filters": [
            {"key": "class", "name": "地区类型", "value": [
                {"n": "全部", "v": "12"},
                {"n": "内地综艺", "v": "1201"},
                {"n": "港台综艺", "v": "1202"},
                {"n": "日韩综艺", "v": "1203"},
                {"n": "欧美综艺", "v": "1204"},
                {"n": "国外综艺", "v": "1205"}
            ]}
        ]},
        {"type_name": "动漫", "type_id": "13", "land": "1", "filters": [
            {"key": "class", "name": "地区类型", "value": [
                {"n": "全部", "v": "13"},
                {"n": "国产动漫", "v": "1301"},
                {"n": "日韩动漫", "v": "1302"},
                {"n": "欧美动漫", "v": "1303"},
                {"n": "海外动漫", "v": "1305"},
                {"n": "里番", "v": "1307"}
            ]}
        ]},
        {"type_name": "福利", "type_id": "15", "land": "1", "filters": [
            {"key": "class", "name": "地区类型", "value": [
                {"n": "全部", "v": "15"},
                {"n": "韩国情色片", "v": "1551"},
                {"n": "日本情色片", "v": "1552"},
                {"n": "大陆情色片", "v": "1555"},
                {"n": "香港情色片", "v": "1553"},
                {"n": "台湾情色片", "v": "1554"},
                {"n": "美国情色片", "v": "1556"},
                {"n": "欧洲情色片", "v": "1557"},
                {"n": "印度情色片", "v": "1558"},
                {"n": "东南亚情色片", "v": "1559"},
                {"n": "其它情色片", "v": "1550"}
            ]}
        ]}
    ]
    return {
        'class': classes,
        # Key the filters by each category's type_id instead of indexing
        # result['class'][0..4] positionally, so adding/reordering a
        # category can never desynchronise the two structures.
        'filters': {c["type_id"]: c["filters"] for c in classes},
    }
def homeVideoContent(self):
    """Scrape the site landing page and return up to 12 recommended items.

    Returns {'list': [vod, ...]} where each vod has vod_id / vod_name /
    vod_pic / vod_remarks; an empty list on any request or parse failure.
    """
    try:
        url = self.get_current_host()
        r = self.fetch(url, headers={"User-Agent": self.ua})
        if r.status_code != 200:
            self.log(f"首页推荐请求失败,状态码:{r.status_code}", "ERROR")
            return {'list': []}
        # One regex per card: detail link, cover image, title, plus two
        # optional tag badges.
        # NOTE(review): the trailing `(?:...)?` groups follow lazy `.*?`,
        # so they can legally match empty even when tag1/tag2 exist in the
        # HTML — verify against a live page that remarks are ever populated.
        pattern = r'<a class="link" href="(/post/[^"]+\.html)".*?<img[^>]*data-src="([^"]+)".*?alt="([^"]+)".*?(?:<div class="tag1[^>]*>([^<]+)</div>)?.*?(?:<div class="tag2">([^<]+)</div>)?'
        matches = re.findall(pattern, r.text, re.DOTALL)
        video_list = []
        for match in matches[:12]:  # cap the home feed at 12 entries
            try:
                link, img_src, title, tag1, tag2 = match
                vod_id_match = re.search(r'/post/(.*?)\.html', link)
                if not vod_id_match:
                    continue
                vod_id = vod_id_match.group(1)
                # Combine whichever badges matched into the remarks string.
                remarks = []
                if tag1:
                    remarks.append(tag1.strip())
                if tag2:
                    remarks.append(tag2.strip())
                vod_remarks = " / ".join(remarks) if remarks else "最新"
                # Normalise protocol-relative and site-relative image URLs.
                if img_src.startswith('//'):
                    img_url = 'https:' + img_src
                elif not img_src.startswith('http'):
                    img_url = urljoin(self.get_current_host(), img_src)
                else:
                    img_url = img_src
                vod = {
                    'vod_id': vod_id,
                    'vod_name': title.strip(),
                    'vod_pic': img_url,
                    'vod_remarks': vod_remarks
                }
                video_list.append(vod)
            except Exception as e:
                # A single bad card should not kill the whole feed.
                self.log(f"首页推荐项解析失败:{str(e)}", "ERROR")
                continue
        self.log(f"首页推荐成功解析{len(video_list)}个项", "INFO")
        return {'list': video_list}
    except Exception as e:
        self.log(f"首页推荐内容获取失败:{str(e)}", "ERROR")
        return {'list': []}
def categoryContent(self, tid, pg, filter, extend):
    """Return one page of a category listing.

    tid    -- category type_id from homeContent (e.g. "10").
    pg     -- 1-based page number (string or int).
    filter -- spider-interface flag, unused here.
    extend -- optional filter dict; a non-empty extend['class'] overrides
              tid with a sub-category id such as "1001".
    Returns a TVBox page dict; `list` stays empty on any failure.
    """
    result = {'list': [], 'page': pg, 'pagecount': 1, 'limit': 40, 'total': 0}
    try:
        # Honour the 'class' filter when present so sub-categories work.
        filter_tid = tid
        if extend and 'class' in extend and extend['class']:
            filter_tid = extend['class']
        # Listing URL: /cat/<id>.html, or /cat/<id>-<pg>.html from page 2 on.
        url = f"{self.get_current_host()}/cat/{filter_tid}"
        if int(pg) > 1:
            url = f"{url}-{pg}"
        url = f"{url}.html"
        r = self.fetch(url, headers={"User-Agent": self.ua})
        if r.status_code != 200:
            self.log(f"分类页请求失败URL{url},状态码:{r.status_code}", "ERROR")
            return result
        # Each card's markup sits between <div class="item"> and its
        # <a class="title"> anchor.
        items = re.findall(r'<div class="item">(.*?)<a class="title"', r.text, re.DOTALL)
        for item in items:
            try:
                link_match = re.search(r'<a class="link" href="(/post/[^"]+\.html)"', item)
                img_match = re.search(r'<img[^>]*data-src="([^"]+)"', item)
                title_match = re.search(r'data-src="[^"]+" alt="([^"]+)"', item)
                tag1_match = re.search(r'<div class="tag1[^>]*>([^<]+)</div>', item)
                tag2_match = re.search(r'<div class="tag2">([^<]+)</div>', item)
                # Link, image and title are mandatory; tags are optional.
                if not link_match or not img_match or not title_match:
                    continue
                link = link_match.group(1)
                img_src = img_match.group(1)
                title = title_match.group(1).strip()
                vod_id = re.search(r'/post/(.*?)\.html', link).group(1)
                remarks = []
                if tag1_match:
                    remarks.append(tag1_match.group(1).strip())
                if tag2_match:
                    remarks.append(tag2_match.group(1).strip())
                vod_remarks = " / ".join(remarks) if remarks else "分类内容"
                # Normalise protocol-relative and site-relative image URLs.
                if img_src.startswith('//'):
                    img_url = 'https:' + img_src
                elif not img_src.startswith('http'):
                    img_url = urljoin(self.get_current_host(), img_src)
                else:
                    img_url = img_src
                vod = {
                    'vod_id': vod_id,
                    'vod_name': title,
                    'vod_pic': img_url,
                    'vod_remarks': vod_remarks
                }
                result['list'].append(vod)
            except Exception as e:
                # Skip the broken card and keep parsing the rest.
                self.log(f"分类项解析失败:{str(e)}", "ERROR")
                continue
        # Derive pagecount from pagination links like /cat/10-3.html.
        page_pattern = r'/cat/\d+-(\d+)\.html'
        page_matches = re.findall(page_pattern, r.text)
        if page_matches:
            page_nums = [int(num) for num in page_matches if num.isdigit()]
            result['pagecount'] = max(page_nums) if page_nums else 1
        else:
            # No pagination links: assume the current page is the last one.
            result['pagecount'] = int(pg)
        self.log(f"分类{tid}{pg}页:解析{len(result['list'])}", "INFO")
        return result
    except Exception as e:
        self.log(f"分类内容获取失败:{str(e)}", "ERROR")
        return result
def detailContent(self, ids):
    """Fetch and parse the detail page for ids[0].

    Returns {"list": [vod]} with title, poster, synopsis and the TVBox
    play-line strings (lines joined by "$$$", episodes by "#", each
    episode as "name$url-or-index"); {"list": []} on failure.
    """
    result = {"list": []}
    if not ids:
        return result
    vod_id = ids[0]
    try:
        detail_url = f"{self.get_current_host()}/post/{vod_id}.html"
        r = self.fetch(detail_url, headers={"User-Agent": self.ua})
        if r.status_code != 200:
            self.log(f"详情页请求失败,状态码:{r.status_code}", "ERROR")
            return result
        # Title from <title>, with the site suffix stripped off.
        title_pattern = r'<title>([^<]+)</title>'
        title_match = re.search(title_pattern, r.text)
        title = title_match.group(1).replace(" - 小鴨看看", "").strip() if title_match else "未知标题"
        # Poster image from the data-poster attribute.
        cover_pattern = r'data-poster="([^"]+)"'
        cover_match = re.search(cover_pattern, r.text)
        cover_url = ""
        if cover_match:
            cover_url = cover_match.group(1)
            # Normalise protocol-relative and site-relative URLs.
            if cover_url.startswith('//'):
                cover_url = 'https:' + cover_url
            elif not cover_url.startswith('http'):
                cover_url = urljoin(self.get_current_host(), cover_url)
        # Synopsis from the meta description tag.
        desc_pattern = r'<meta name="description" content="([^"]*)"'
        desc_match = re.search(desc_pattern, r.text)
        desc = desc_match.group(1) if desc_match else ""
        # Play lines and episode strings accumulated below.
        play_sources = []
        play_urls = []
        # Primary source: the inline `var pp = {...};` JSON blob.
        # NOTE(review): assumes pp['lines'] entries are shaped like
        # [id, name, ?, [url, ...]] — confirm against a live page.
        script_pattern = r'var pp\s*=\s*({.*?});'
        script_match = re.search(script_pattern, r.text, re.DOTALL)
        if script_match:
            try:
                pp_data = json.loads(script_match.group(1))
                if 'lines' in pp_data:
                    for line in pp_data['lines']:
                        if len(line) >= 4:
                            line_name = line[1] if isinstance(line[1], str) else f"线路{line[0]}"
                            play_sources.append(line_name)
                            episodes = []
                            urls = line[3] if isinstance(line[3], list) and len(line[3]) > 0 else []
                            for idx, url in enumerate(urls):
                                # Only keep strings ending in a known video extension.
                                if isinstance(url, str) and any(url.endswith(fmt) for fmt in self.VIDEO_FORMATS):
                                    # Prefer an "ep-<n>" token from the URL as the label;
                                    # otherwise "全集" for a single file, else the 1-based index.
                                    episode_name_match = re.search(r'ep-([\d\w]+)', url)
                                    if episode_name_match:
                                        episode_name = f"{episode_name_match.group(1)}"
                                    elif len(urls) == 1:
                                        episode_name = "全集"
                                    else:
                                        episode_name = f"{idx+1}"
                                    episodes.append(f"{episode_name}${url}")
                            if episodes:
                                play_urls.append("#".join(episodes))
            except Exception as e:
                self.log(f"解析JavaScript播放信息失败{str(e)}", "ERROR")
        # Fallback: scrape HTML source/episode lists when the JS blob yields nothing.
        if not play_sources:
            source_pattern = r'<div class="source"[^>]*>.*?<span class="name">([^<]+)</span>.*?<div class="list">(.*?)</div>'
            source_matches = re.findall(source_pattern, r.text, re.DOTALL)
            for source_name, list_html in source_matches:
                play_sources.append(source_name.strip())
                # HTML episodes carry an index (not a URL); playerContent
                # receives that index as its `id`.
                episode_pattern = r'<a[^>]*data-sou_idx="\d+"[^>]*>([^<]+)</a>'
                episode_matches = re.findall(episode_pattern, list_html)
                episodes = []
                for idx, ep_name in enumerate(episode_matches):
                    episodes.append(f"{ep_name.strip()}${idx}")
                if episodes:
                    play_urls.append("#".join(episodes))
        vod = {
            "vod_id": vod_id,
            "vod_name": title,
            "vod_pic": cover_url,
            "vod_content": desc,
            # TVBox convention: "$$$" separates the play lines.
            "vod_play_from": "$$$".join(play_sources) if play_sources else "",
            "vod_play_url": "$$$".join(play_urls) if play_urls else ""
        }
        result["list"].append(vod)
        self.log(f"详情页解析成功ID{vod_id}", "INFO")
        return result
    except Exception as e:
        self.log(f"详情页解析失败ID{vod_id},错误:{str(e)}", "ERROR")
        return result
def playerContent(self, flag, id, vipFlags):
    """Resolve a play id into a TVBox player result.

    flag     -- play-line name (unused; one resolution path for all lines).
    id       -- play target from detailContent: a direct media URL or a raw
                index string. (The name shadows the builtin `id` but is kept
                for spider-interface compatibility.)
    vipFlags -- spider-interface argument, unused.

    The original code had an `if id.startswith('http')` branch whose return
    dict was byte-identical to the fall-through return — dead duplication,
    merged into a single return here. Behavior for callers is unchanged:
    the value is handed back as a direct-play URL ("parse": 0) with a
    desktop UA and the site Referer attached.
    """
    try:
        return {
            "parse": 0,      # 0 = direct play, no webview sniffing needed
            "playUrl": '',
            "url": id,
            "header": {
                "User-Agent": self.ua,
                "Referer": self.get_current_host() + "/"
            }
        }
    except Exception as e:
        # Defensive: get_current_host() could fail on a misconfigured host map.
        self.log(f"播放地址解析失败:{str(e)}", "ERROR")
        return {"parse": 0, "playUrl": '', "url": id, "header": {"User-Agent": self.ua}}
def searchContent(self, key, quick):
    """Search for `key` via Google restricted to the target site.

    Returns {"list": [vod, ...]}; entries carry no poster image since
    Google results do not expose one. `quick` is unused.

    NOTE(review): Google's `sitesearch` query parameter is legacy — a
    `q=site:xiaoyakankan.com <key>` query may be more reliable; Google
    also commonly blocks automated clients, so this can return nothing.
    """
    result = {"list": []}
    try:
        # Google search URL with the site-restriction parameter.
        google_search_url = f"https://www.google.com/search?q={quote(key)}&sitesearch=xiaoyakankan.com"
        self.log(f"构造Google搜索URL: {google_search_url}")
        headers = {
            "User-Agent": self.ua,
            "Referer": self.get_current_host(),
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive"
        }
        r = self.fetch(google_search_url, headers=headers, timeout=10)
        if r.status_code != 200:
            self.log(f"Google搜索请求失败状态码{r.status_code},内容:{r.text[:200]}", "ERROR")
            return result
        # Manually inflate the body if it is still gzip-compressed.
        # NOTE(review): most HTTP clients decode gzip transparently and
        # r.text was already read above, so this branch is likely dead —
        # confirm against the base Spider.fetch implementation.
        if 'gzip' in r.headers.get('Content-Encoding', ''):
            r._content = gzip.decompress(r.content)
        try:
            soup = BeautifulSoup(r.text, 'html.parser')
            # Google result blocks carry a numeric data-hveid attribute.
            search_results_div = soup.find_all('div', {'data-hveid': re.compile(r'\d+')})
            for item in search_results_div:
                a_tag = item.find('a', href=re.compile(r'https?://'))
                if not a_tag:
                    continue
                link = a_tag['href']
                # Keep only links pointing at the currently selected mirror.
                if self.get_current_host() not in link:
                    continue
                # Extract the vod id from /post/<id>.html detail links.
                vod_id_match = re.search(r'/post/([^/]+)\.html', link)
                if not vod_id_match:
                    continue
                vod_id = vod_id_match.group(1)
                title_tag = item.find('h3')
                title = title_tag.text.strip() if title_tag else "未知标题"
                vod = {
                    "vod_id": vod_id,
                    "vod_name": title,
                    # Google results carry no poster image; leave it blank.
                    "vod_pic": "",
                    "vod_remarks": "Google搜索结果"
                }
                result["list"].append(vod)
        except Exception as e:
            self.log(f"解析Google搜索结果失败{str(e)}", "ERROR")
            return result
        self.log(f"Google搜索成功解析{len(result['list'])}个项", "INFO")
        return result
    except Exception as e:
        self.log(f"搜索内容获取失败:{str(e)}", "ERROR")
        return result
def isVideoFormat(self, url):
    """Return True when the URL ends with a supported video extension."""
    lowered = url.lower()
    for fmt in self.VIDEO_FORMATS:
        if lowered.endswith(fmt):
            return True
    return False
def manualVideoCheck(self):
    """Manual play-URL verification hook; not needed for this spider."""
    return None
def localProxy(self, param):
    """Local proxy hook (unused by this spider); always returns None."""
    return None