feat(PyramidStore): initialize the project and add base configuration files

Add .gitignore to ignore sub-repositories' .git directories
Add LICENSE using the GNU General Public License v3.0
Add README.md with debugging examples, a disclaimer, and configuration notes
Add base/localProxy.py, the base proxy configuration file
Add version-controlled image files (binary diff)
commit 3572e29279
2025-10-23 02:14:43 +08:00
356 changed files with 120993 additions and 0 deletions
@@ -0,0 +1,533 @@
import re
import sys
import urllib.parse
import threading
import time
import requests
from pyquery import PyQuery as pq
sys.path.append('..')
from base.spider import Spider
class Spider(Spider):
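    """Spider for the 好色TV (hsex) site: probes mirror domains for the fastest
    host, then parses the home, category, detail, and search pages into the
    standard vod result dicts."""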
def __init__(self):
        # Basic configuration
self.name = '好色TV'
self.host = 'https://hsex.icu/'
self.candidate_hosts = [
"https://hsex.icu/",
"https://hsex1.icu/",
"https://hsex.tv/"
]
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Referer': self.host
}
self.timeout = 5000
        # Category map (key fix: the video category's url_suffix is empty so its URLs match the list-{pg}.htm format)
        self.class_map = {
            '视频': {'type_id': 'list', 'url_suffix': ''},  # Fix 1: empty suffix for the video category
'周榜': {'type_id': 'top7', 'url_suffix': 'top7'},
'月榜': {'type_id': 'top', 'url_suffix': 'top'},
'5分钟+': {'type_id': '5min', 'url_suffix': '5min'},
'10分钟+': {'type_id': 'long', 'url_suffix': 'long'}
}
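        # Example category URLs implied by this map (see categoryContent below):
        #   视频 -> {host}list-{pg}.htm,       e.g. https://hsex.icu/list-1.htm
        #   周榜 -> {host}top7_list-{pg}.htm,  e.g. https://hsex.icu/top7_list-1.htm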
def getName(self):
return self.name
def init(self, extend=""):
        # Probe the candidate domains and use the fastest reachable one
self.host = self.get_fastest_host()
self.headers['Referer'] = self.host
def isVideoFormat(self, url):
if not url:
return False
return any(fmt in url.lower() for fmt in ['.mp4', '.m3u8', '.flv', '.avi'])
def manualVideoCheck(self):
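        """Return a checker callable; presumably handed to the host app so
        sniffed URLs can be verified with a lightweight HEAD request before
        playback (assumed usage)."""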
def check(url):
if not self.isVideoFormat(url):
return False
try:
resp = self.fetch(url, headers=self.headers, method='HEAD', timeout=3)
return resp.status_code in (200, 302) and 'video' in resp.headers.get('Content-Type', '')
except:
return False
return check
def get_fastest_host(self):
"""测试候选域名,返回最快可用的"""
results = {}
threads = []
def test_host(url):
try:
start_time = time.time()
resp = requests.head(url, headers=self.headers, timeout=2, allow_redirects=False)
if resp.status_code in (200, 301, 302):
delay = (time.time() - start_time) * 1000
results[url] = delay
else:
results[url] = float('inf')
except:
results[url] = float('inf')
for host in self.candidate_hosts:
t = threading.Thread(target=test_host, args=(host,))
threads.append(t)
t.start()
for t in threads:
t.join()
        valid_hosts = [(h, d) for h, d in results.items() if d != float('inf')]
        # Sort by measured latency so the fastest host is returned, not merely
        # whichever entry happens to come first in the dict
        valid_hosts.sort(key=lambda x: x[1])
        return valid_hosts[0][0] if valid_hosts else self.candidate_hosts[0]
def homeContent(self, filter):
result = {}
        # Build the category list
classes = []
for name, info in self.class_map.items():
classes.append({
'type_name': name,
'type_id': info['type_id']
})
result['class'] = classes
try:
            # Fetch the home page
html = self.fetch_with_retry(self.host, retry=2, timeout=5).text
data = pq(html)
            # Extract the video list
vlist = []
items = data('.row .col-xs-6.col-md-3')
for item in items.items():
try:
title = item('h5').text().strip()
if not title:
continue
                    # Extract the thumbnail URL from the inline style
style = item('.image').attr('style') or ''
pic_match = re.search(r'url\(["\']?([^"\']+)["\']?\)', style)
vod_pic = pic_match.group(1) if pic_match else ''
if vod_pic and not vod_pic.startswith('http'):
vod_pic = f"{self.host.rstrip('/')}/{vod_pic.lstrip('/')}"
                    # Extract the duration as the remark
desc = item('.duration').text().strip() or '未知'
                    # Extract the video ID from the link href
href = item('a').attr('href') or ''
if not href:
continue
vod_id = href.split('/')[-1]
if not vod_id.endswith('.htm'):
vod_id += '.htm'
vlist.append({
'vod_id': vod_id,
'vod_name': title,
'vod_pic': vod_pic,
'vod_remarks': desc
})
except Exception as e:
print(f"解析首页视频项失败: {e}")
continue
result['list'] = vlist
except Exception as e:
print(f"首页解析失败: {e}")
result['list'] = []
return result
def homeVideoContent(self):
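        # The recommended-videos feed is unused here; home-page videos are
        # already returned by homeContent above.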
return []
def categoryContent(self, tid, pg, filter, extend):
result = {}
try:
            # Look up the category info for this type_id
cate_info = None
for name, info in self.class_map.items():
if info['type_id'] == tid:
cate_info = info
break
if not cate_info:
result['list'] = []
return result
            # Key fix: the video category uses a different URL format from the others
            if tid == 'list':  # the video category (type_id 'list')
                url = f"{self.host}list-{pg}.htm"  # format: list-1.htm, list-2.htm, ...
            else:  # ranking categories (weekly/monthly, etc.): {suffix}_list-{pg}.htm
                url = f"{self.host}{cate_info['url_suffix']}_list-{pg}.htm"
            # Request the category page
html = self.fetch(url, headers=self.headers, timeout=8).text
html = html.encode('utf-8', errors='ignore').decode('utf-8')
data = pq(html)
            # Extract the video list
vlist = []
items = data('.row .col-xs-6.col-md-3')
for item in items.items():
try:
title = item('h5').text().strip()
if not title:
continue
style = item('.image').attr('style') or ''
pic_match = re.search(r'url\(["\']?([^"\']+)["\']?\)', style)
vod_pic = pic_match.group(1) if pic_match else ''
if vod_pic and not vod_pic.startswith('http'):
vod_pic = f"{self.host.rstrip('/')}/{vod_pic.lstrip('/')}"
desc = item('.duration').text().strip() or '未知'
href = item('a').attr('href') or ''
if not href:
continue
vod_id = href.split('/')[-1]
if not vod_id.endswith('.htm'):
vod_id += '.htm'
vlist.append({
'vod_id': vod_id,
'vod_name': title,
'vod_pic': vod_pic,
'vod_remarks': desc
})
except Exception as e:
print(f"解析分类视频项失败: {e}")
continue
            # Extract the total page count
pagecount = 1
try:
pagination = data('.pagination1 li a')
page_nums = []
for a in pagination.items():
text = a.text().strip()
if text.isdigit():
page_nums.append(int(text))
if page_nums:
pagecount = max(page_nums)
except:
pagecount = 1
result['list'] = vlist
result['page'] = pg
result['pagecount'] = pagecount
result['limit'] = len(vlist)
result['total'] = 999999
except Exception as e:
print(f"分类解析失败: {e}")
result['list'] = []
result['page'] = pg
result['pagecount'] = 1
result['limit'] = 0
result['total'] = 0
return result
def detailContent(self, ids):
try:
if not ids or not ids[0]:
return {'list': []}
vod_id = ids[0].strip()
if not vod_id.endswith('.htm'):
vod_id += '.htm'
url = f"{self.host}{vod_id.lstrip('/')}"
html = self.fetch_with_retry(url, retry=2, timeout=8).text
html = html.encode('utf-8', errors='ignore').decode('utf-8')
data = pq(html)
            # Extract the title
title = data('.panel-title, .video-title, h1').text().strip() or '未知标题'
            # Extract the cover image
vod_pic = ''
poster_style = data('.vjs-poster').attr('style') or ''
pic_match = re.search(r'url\(["\']?([^"\']+)["\']?\)', poster_style)
if pic_match:
vod_pic = pic_match.group(1)
if not vod_pic:
vod_pic = data('.video-pic img, .vjs-poster img, .thumbnail img').attr('src') or ''
if vod_pic and not vod_pic.startswith('http'):
vod_pic = f"{self.host}{vod_pic.lstrip('/')}"
            # Extract the duration and view count
duration = '未知'
views = '未知'
info_items = data('.panel-body .col-md-3, .video-info .info-item, .info p')
for item in info_items.items():
text = item.text().strip()
if '时长' in text or 'duration' in text.lower():
duration = text.replace('时长:', '').replace('时长', '').strip()
elif '观看' in text or 'views' in text.lower():
views_match = re.search(r'(\d+\.?\d*[kK]?)次观看', text)
if views_match:
views = views_match.group(1)
else:
views = text.replace('观看:', '').replace('观看', '').strip()
remarks = f"{duration} | {views}"
            # Extract the playback URL
video_url = ''
m3u8_match = re.search(r'videoUrl\s*=\s*["\']([^"\']+\.m3u8)["\']', html)
if m3u8_match:
video_url = m3u8_match.group(1)
if not video_url:
source = data('source[src*=".m3u8"], source[src*=".mp4"]')
video_url = source.attr('src') or ''
if not video_url:
js_matches = re.findall(r'(https?://[^\s"\']+\.(?:m3u8|mp4))', html)
if js_matches:
video_url = js_matches[0]
if video_url and not video_url.startswith('http'):
video_url = f"{self.host}{video_url.lstrip('/')}"
vod = {
'vod_id': vod_id,
'vod_name': title,
'vod_pic': vod_pic,
'vod_remarks': remarks,
'vod_play_from': '好色TV',
'vod_play_url': f'正片${video_url}' if video_url else '正片$暂无地址'
}
return {'list': [vod]}
except Exception as e:
print(f"详情解析失败: {e}")
return {'list': []}
def searchContent(self, key, quick, pg=1):
try:
            # Validate the keyword
            if not key.strip():
                print("Search keyword must not be empty")
return {'list': [], 'page': int(pg), 'pagecount': 1, 'limit': 0, 'total': 0}
            # Build the search URL. The query string is embedded in the target URL
            # so it survives the relay in fetch(): passing it via params would append
            # it to the proxy URL rather than the proxied target, and pre-quoting
            # combined with requests' own encoding would double-encode the keyword.
            encoded_key = urllib.parse.quote(key.strip(), encoding='utf-8', errors='replace')
            search_url = f"{self.host}search.htm?search={encoded_key}&page={int(pg)}"
            # Send the request
            resp = self.fetch(
                url=search_url,
                headers=self.headers,
                timeout=8
            )
if resp.status_code not in (200, 302):
print(f"搜索页面请求失败URL{resp.url},状态码:{resp.status_code}")
return {'list': [], 'page': int(pg), 'pagecount': 1, 'limit': 0, 'total': 0}
            # Process the page content
html = resp.text.encode('utf-8', errors='ignore').decode('utf-8')
data = pq(html)
            # Detect the no-results case
no_result_texts = ['没有找到相关视频', '无搜索结果', 'No results found', '未找到匹配内容']
no_result = any(data(f'div:contains("{text}"), p:contains("{text}")').text() for text in no_result_texts)
if no_result:
print(f"搜索关键词「{key}」第{pg}页无结果")
return {'list': [], 'page': int(pg), 'pagecount': 1, 'limit': 0, 'total': 0}
            # Parse the search results
vlist = []
items = data('.row .col-xs-6.col-md-3')
for item in items.items():
try:
title = item('h5').text().strip()
if not title:
continue
style = item('.image').attr('style') or ''
pic_match = re.search(r'url\(["\']?([^"\']+)["\']?\)', style)
vod_pic = pic_match.group(1) if pic_match else ''
if vod_pic and not vod_pic.startswith(('http://', 'https://')):
vod_pic = f"{self.host.rstrip('/')}/{vod_pic.lstrip('/')}"
desc = item('.duration').text().strip() or '未知时长'
href = item('a').attr('href') or ''
if not href:
continue
vod_id = href.split('/')[-1]
if not vod_id.endswith('.htm'):
vod_id += '.htm'
vlist.append({
'vod_id': vod_id,
'vod_name': title,
'vod_pic': vod_pic,
'vod_remarks': desc
})
except Exception as e:
print(f"解析单条搜索结果失败:{e}(跳过该条)")
continue
            # Parse the total page count
pagecount = 1
try:
pagination = data('.pagination1 li a')
page_nums = []
for a in pagination.items():
text = a.text().strip()
if text.isdigit():
page_nums.append(int(text))
if page_nums:
pagecount = max(page_nums)
print(f"搜索关键词「{key}」分页解析完成,共{pagecount}")
except Exception as e:
print(f"解析分页失败(默认单页):{e}")
pagecount = 1
            # Return the result (fix 2: restore the quotes around the 'page' key, correcting a syntax error)
total = len(vlist) * pagecount
print(f"搜索关键词「{key}」第{pg}页处理完成,结果{len(vlist)}条,总页数{pagecount}")
return {
'list': vlist,
                'page': int(pg),  # the original code was missing the quotes here, causing a syntax error
'pagecount': pagecount,
'limit': len(vlist),
'total': total
}
except Exception as e:
print(f"搜索功能整体异常:{e}")
return {
'list': [],
                'page': int(pg),
                'pagecount': 1,
'limit': 0,
'total': 0
}
def playerContent(self, flag, id, vipFlags):
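        """Build the play payload; Referer/Origin/Host are set so the stream
        request appears to originate from the site itself."""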
headers = self.headers.copy()
headers.update({
'Referer': self.host,
'Origin': self.host.rstrip('/'),
'Host': urllib.parse.urlparse(self.host).netloc,
})
        # Values below follow the rule's play_parse / double settings
        return {
            'parse': 1,  # per the rule's play_parse setting
            'url': id,
            'header': headers,
            'double': True  # per the rule's double setting
        }
def localProxy(self, param):
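        """Fetch an image on the player's behalf and return the
        [status, content_type, bytes] triple the local proxy expects."""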
try:
url = param['url']
if url and not url.startswith(('http://', 'https://')):
url = f"{self.host.rstrip('/')}/{url.lstrip('/')}"
img_headers = self.headers.copy()
img_headers.update({'Accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8'})
res = self.fetch(url, headers=img_headers, timeout=10)
content_type = res.headers.get('Content-Type', 'image/jpeg')
return [200, content_type, res.content]
except Exception as e:
print(f"图片代理失败: {e}")
return [200, 'image/jpeg', b'']
    def fetch_with_retry(self, url, retry=2, timeout=5):
        for i in range(retry + 1):
            try:
                # fetch() already wraps the URL with the relay proxy, so pass the
                # raw URL here; wrapping it again would proxy the request twice
                resp = self.fetch(url, headers=self.headers, timeout=timeout)
                if resp.status_code in (200, 301, 302):
                    return resp
                print(f"Request to {url} returned status {resp.status_code}, retrying...")
            except Exception as e:
                print(f"Attempt {i + 1} for {url} failed: {e}")
            if i < retry:
                time.sleep(0.5)
        # Fall back to a minimal stand-in response object
        return type('obj', (object,), {'text': '', 'status_code': 404})()
def fetch(self, url, headers=None, timeout=5, method='GET', params=None):
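        """HTTP helper: every request is relayed through the vpsdn.leuse.top
        proxy endpoint rather than sent to the target host directly."""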
headers = headers or self.headers
params = params or {}
try:
if method.upper() == 'GET':
resp = requests.get(
f'https://vpsdn.leuse.top/proxy?single=true&url={urllib.parse.quote(url)}',
headers=headers,
timeout=timeout,
allow_redirects=True,
                    params=params  # allow GET requests to carry extra query params (appended to the proxy URL)
)
elif method.upper() == 'HEAD':
resp = requests.head(
f'https://vpsdn.leuse.top/proxy?single=true&url={urllib.parse.quote(url)}',
headers=headers,
timeout=timeout,
allow_redirects=False,
params=params
)
else:
                resp = requests.get(  # fall back to GET for any other method
f'https://vpsdn.leuse.top/proxy?single=true&url={urllib.parse.quote(url)}',
headers=headers,
timeout=timeout,
allow_redirects=True,
params=params
)
            # Pick an encoding that avoids mojibake on Chinese pages: trust a
            # declared charset, otherwise fall back to utf-8 instead of
            # requests' latin-1 default
            if 'charset' not in resp.headers.get('Content-Type', '').lower():
                resp.encoding = 'utf-8'
return resp
except Exception as e:
print(f"网络请求失败({url}): {e}")
# 返回统一格式空响应,避免后续逻辑崩溃
return type('obj', (object,), {
'text': '',
'status_code': 500,
'headers': {},
'url': url
})
# ------------------------------
# Optional self-test (runs only when this file is executed directly)
# ------------------------------
if __name__ == "__main__":
    # Initialize the spider
    spider = Spider()
    spider.init()
    # Test home page content
    print("=== Home page test ===")
    home_data = spider.homeContent(filter='')
    print(f"Home categories: {len(home_data['class'])}")
    print(f"Home videos: {len(home_data['list'])}")
    # Test the video category (exercises the fixed URL format)
    print("\n=== Video category, page 1 ===")
    cate_data = spider.categoryContent(tid='list', pg=1, filter='', extend='')
    print(f"Videos on page 1: {len(cate_data['list'])}")
    print(f"Total pages: {cate_data['pagecount']}")
    # Test search (exercises the syntax-error fix)
    print("\n=== Search test (keyword: 测试) ===")
    search_data = spider.searchContent(key="测试", quick=False, pg=1)
    print(f"Search results: {len(search_data['list'])}")
    print(f"Search total pages: {search_data['pagecount']}")